From 04be68f3a94ecf7975a5f780f147da973757dae3 Mon Sep 17 00:00:00 2001 From: Xiao-Yong Jin Date: Fri, 8 Nov 2024 16:30:57 -0600 Subject: [PATCH 01/23] simd: fix for nim 2.2.0 --- src/simd.nim | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/simd.nim b/src/simd.nim index bde7919..0842aea 100644 --- a/src/simd.nim +++ b/src/simd.nim @@ -27,20 +27,20 @@ when defined(SSE) or defined(AVX) or defined(AVX512): when true: when not declared(SimdS16): when declared(SimdS8): - msa(SimdS16, 2, SimdS8[]) + msa(SimdS16, 2, `[]`(SimdS8)) elif declared(SimdS4): - msa(SimdS16, 4, SimdS4[]) + msa(SimdS16, 4, `[]`(SimdS4)) elif declared(SimdS2): - msa(SimdS16, 8, SimdS2[]) + msa(SimdS16, 8, `[]`(SimdS2)) else: msa(SimdS16, 16, float32) when not declared(SimdD16): when declared(SimdD8): - msa(SimdD16, 2, `[]`SimdD8) + msa(SimdD16, 2, `[]`(SimdD8)) elif declared(SimdD4): - msa(SimdD16, 4, SimdD4[]) + msa(SimdD16, 4, `[]`(SimdD4)) elif declared(SimdD2): - msa(SimdD16, 8, SimdD2[]) + msa(SimdD16, 8, `[]`(SimdD2)) else: msa(SimdD16, 16, float64) when not declared(SimdS16Obj): @@ -52,16 +52,16 @@ when true: when true: when not declared(SimdS8): when declared(SimdS4): - msa(SimdS8, 2, SimdS4[]) + msa(SimdS8, 2, `[]`(SimdS4)) elif declared(SimdS2): - msa(SimdS8, 4, SimdS2[]) + msa(SimdS8, 4, `[]`(SimdS2)) else: msa(SimdS8, 8, float32) when not declared(SimdD8): when declared(SimdD4): - msa(SimdD8, 2, SimdD4[]) + msa(SimdD8, 2, `[]`(SimdD4)) elif declared(SimdD2): - msa(SimdD8, 4, SimdD2[]) + msa(SimdD8, 4, `[]`(SimdD2)) else: msa(SimdD8, 8, float64) when not declared(SimdS8Obj): @@ -73,12 +73,12 @@ when true: when true: when not declared(SimdS4): when declared(SimdS2): - msa(SimdS4, 2, SimdS2[]) + msa(SimdS4, 2, `[]`(SimdS2)) else: msa(SimdS4, 4, float32) when not declared(SimdD4): when declared(SimdD2): - msa(SimdD4, 2, SimdD2[]) + msa(SimdD4, 2, `[]`(SimdD2)) else: msa(SimdD4, 4, float64) when not declared(SimdS4Obj): From 0508c8b12facfde71e59d86a2649c96b00da1ebd Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sun, 10 Nov 2024 22:16:40 -0600 Subject: [PATCH 02/23] fix rpath for mac --- src/comms/qmp.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/comms/qmp.nim b/src/comms/qmp.nim index 6304c4b..dc8ac6f 100644 --- a/src/comms/qmp.nim +++ b/src/comms/qmp.nim @@ -5,7 +5,7 @@ when existsEnv("QMPDIR"): else: const qmpDir {.strDefine.} = getHomeDir() & "lqcd/install/qmp" const qmpPassC = "-I" & qmpDir & "/include" -const qmpPassL* = "-L" & qmpDir & "/lib -lqmp -Wl,-rpath=" & qmpDir & "/lib" +const qmpPassL* = "-L" & qmpDir & "/lib -lqmp -Wl,-rpath," & qmpDir & "/lib" static: echo "Using QMP: ", qmpDir echo "QMP compile flags: ", qmpPassC From db850d40ced27286fb6cfae766bf86e6ee567a62 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 12 Nov 2024 20:18:09 -0600 Subject: [PATCH 03/23] fix lapack tests on mac --- src/eigens/lapack.h | 12 ++++++++++++ src/eigens/lapack.nim | 17 ++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/eigens/lapack.h b/src/eigens/lapack.h index 5524d35..ba0781a 100644 --- a/src/eigens/lapack.h +++ b/src/eigens/lapack.h @@ -7,6 +7,18 @@ typedef double doublereal; r f a; \ r U(f) a +L(void, cgemm, (const char *transa, const char *transb, + const int *m, const int *n, const int *k, + const float *alpha, const float *a, + const int *lda, const float *b, const int *ldb, + const float *beta, float *c, const int *ldc)); + +L(void, dgemm, (const char *transa, const char *transb, + const int *m, const int *n, const int *k, + const double *alpha, const double *a, + const int *lda, const double *b, const int *ldb, + const double *beta, double *c, const int *ldc)); + L(void, dsterf, (integer *n, doublereal *d, doublereal *e, integer *info)); L(void, dgetrf, (integer *m, integer *n, doublereal *a, integer * lda, integer *ipiv, integer *info)); diff --git a/src/eigens/lapack.nim b/src/eigens/lapack.nim index baf0d0c..3a32ef9 100644 --- a/src/eigens/lapack.nim +++ b/src/eigens/lapack.nim @@ -3,7 +3,8 @@ const lapackLib {.strdefine.} = "-llapack -lblas" #const lapackLib {.strdefine.} = "/usr/lib/lapack/liblapack.a -lblas -lgfortran" #const lapackLib {.strdefine.} = "-L/usr/lib/lapack -llapack" {.passL: lapackLib.} -{.pragma: lapack, header: hdr.} +#{.pragma: lapack, header: hdr.} +{.pragma: lapack.} type fint* = cint @@ -102,3 +103,17 @@ proc dbdsvdx*(uplo: cstring, jobz: cstring, range: cstring, n: ptr fint, proc dlasq1*(n: ptr fint; d: ptr float64; e: ptr float64; work: ptr float64; info: ptr fint) {.lapack, importc: "dlasq1_".} + +when isMainModule: + template toPtrInt32(x: int): ptr int32 = + var t = x.int32 + addr t + template toPtrScomplex(x: int): ptr scomplex = + var t = scomplex(re: x.float32, im: 0'f32) + addr t + template `&`(x: int): untyped = toPtrInt32(x) + template `&&`(x: int): untyped = toPtrScomplex(x) + var c,a,b: ptr scomplex + var cr,cc,bc: int + + cgemm("C","N", &cc,&cr,&bc, &&1, b,&bc, a,&bc, &&0, c,&cc) From b81fec058fbad6df39781c16c3dc6c58dd02a91b Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 14 Nov 2024 11:01:18 -0600 Subject: [PATCH 04/23] update Nim versions in CI --- .github/workflows/test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8cd40fd..b2ea5d7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,16 +2,16 @@ name: test on: pull_request: push: - branches: - - 'devel' - - 'master' +# branches: +# - 'devel' +# - 'master' jobs: build: strategy: matrix: mpi-impl: [openmpi, mpich] - nim-branch: [version-1-6, version-2-0, devel] + nim-branch: [version-2-0, version-2-2, devel] fuel-compat: [0, 1] fail-fast: false name: nim-${{ matrix.nim-branch }}-${{ matrix.mpi-impl }}-FUELCompat:${{ matrix.fuel-compat }} From ffaa9510ca08574dd49d8c4c97e348f0fba04388 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 14 Nov 2024 13:31:50 -0600 Subject: [PATCH 05/23] merge some changes from wrap branch add experimental halo support --- src/bench/benchStagProp.nim | 6 +- src/comms/halo.nim | 441 ++++++++++++++++++++ src/experimental/pgag.nim | 785 ++++++++++++++++++++++++++++++++++++ src/gauge/hypsmear2.nim | 544 +++++++++++++++++++++++++ src/gauge/staples.nim | 48 +++ 5 files changed, 1823 insertions(+), 1 deletion(-) create mode 100644 src/comms/halo.nim create mode 100644 src/experimental/pgag.nim create mode 100644 src/gauge/hypsmear2.nim diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim index 9580ffc..2ac4f8f 100644 --- a/src/bench/benchStagProp.nim +++ b/src/bench/benchStagProp.nim @@ -18,8 +18,11 @@ var v1 = lo.ColorVector() var v2 = lo.ColorVector() var r = lo.ColorVector() var rs = newRNGField(RngMilc6, lo, intParam("seed", 987654321).uint64) +let warm0 = 0.29 + 500.0/lo.physVol.float +var warm = floatParam("warm", warm0) threads: - g.random rs + #g.random rs + g.warm warm, rs g.setBC g.stagPhase v1 := 0 @@ -67,6 +70,7 @@ for i in 0..3: g3[2*i] = g[i] g3[2*i+1] = lo.ColorMatrix() g3[2*i+1].randomSU rs + g3[2*i+1] *= 0.1 var s3 = newStag3(g3) #s3.D(v2, v1, m) s3.solve(v2, v1, mass, sp) diff --git a/src/comms/halo.nim b/src/comms/halo.nim new file mode 100644 index 0000000..ba2cc95 --- /dev/null +++ b/src/comms/halo.nim @@ -0,0 +1,441 @@ +import qex, base/hyper, comms/gather +getOptimPragmas() + +type + HaloLayout*[L] = ref object + lo*: L # layout + outerExt*: seq[int32] # extended outer lattice size + offset*: seq[int32] # offset of outer lattice in extended outer + lex*: seq[int32] # outerExt lex index of extended sites + index*: seq[int32] # index of extended site for given lex index + neighborFwd*: seq[seq[int32]] # fwd neighbor for extended outer lattice + neighborBck*: seq[seq[int32]] # bck neighbor for extended outer lattice + nOut*: int # sites in outer lattice + nExt*: int # sites in extended outer lattice + Halo*[L,F,T] = ref object + layout*: HaloLayout[L] + field*: F + halo*: alignedMem[T] + nOut*: int # sites in outer lattice + nExt*: int # sites in extended outer lattice + HaloMap*[L] = ref object + layout*: HaloLayout[L] + offsets*: seq[seq[int32]] # offsets of outer sites needed + gather*: GatherMap # gather map to fetch needed halo sites + gatherRev*: GatherMap # gather map to send halo sites back + GatherHalo*[F,T;Rev:static bool] = object + src: F + dest: ptr UncheckedArray[T] + elemsize: int + vlen: int + +proc norm2*(h: Halo): auto = + var s = h.halo[0].norm2 + for i in 1..= hi[i]: + result = false + break + +proc init[T](s: var seq[T], x: openArray[SomeNumber]) = + s.newSeq(x.len) + for i in 0..= hl.lo.physGeom[l]: y[l] -= int32 hl.lo.physGeom[l] + let ri = hl.lo.rankIndex(y) + let didx = vlen*(i-hl.nOut)+k + rl.add RecvList(didx: int32 didx, srank: int32 ri.rank, sidx: int32 ri.index) + sl.add SendList(sidx: int32 didx, drank: int32 ri.rank, didx: int32 ri.index) + toc("RecvList") + result.gather = c.makeGatherMap(rl) + result.gatherRev = c.makeGatherMap(sl) + toc("makeGatherMap") +proc makeHaloMap*[L,N](hl: HaloLayout[L], c: Comm, offsets: seq[array[N,SomeInteger]]): HaloMap[L] = + var o = newSeq[seq[int32]](offsets.len) + for i in 0.. 0: + h.map.neighborFwd[mu][i] + else: + h.map.neighborBck[mu][i] + +when isMainModule: + qexInit() + tic("main") + var defaultLat = @[4,4,4,4] + defaultSetup() + let nd = lo.nDim + var seed = 987654321'u + #var rng = newRngField(lo, RngMilc6, seed) + var rng = newRngField(lo, MRG32k3a, seed) + g.gaussian rng + toc "gaussian" + #var r0 = lo.Real() + #var cv1 = lo.ColorVector() + #var cv2 = lo.ColorVector() + echo 6.0 * g.plaq + toc "plaq" + #cv0.gaussian rng + resetTimers() + + proc testPlaq = + tic("testPlaq") + #let hl = lo.makeHaloLayout([1,1,1,1],[1,1,1,1]) + let hl = lo.makeHaloLayout([1,1,1,1],[0,0,0,0]) + toc "makeHaloLayout" + type HM = HaloMap[type lo] + let comm = getDefaultComm() + var hm = newSeq[HM](nd) + for mu in 0.. [0,1,0,0],[0,0,1,0],[0,0,0,1], +# [-1,0,0,0],[0,-1,0,0],[0,0,-1,0],[0,0,0,-1], +# [-1,1,0,0],[-1,0,1,0],[-1,0,0,1] + +# [1,0,0,0],[0,1,0,0],[0,0,1,0],[0,0,0,1], +# [-1,0,0,0],[0,-1,0,0],[0,0,-1,0],[0,0,0,-1], +# [-1,1,0,0],[-1,0,1,0],[-1,0,0,1], +# [1,-1,0,0],[0,-1,1,0],[0,-1,0,1], +# [1,0,-1,0],[0,1,-1,0],[0,0,-1,1], +# [1,0,0,-1],[0,1,0,-1],[0,0,1,-1] + diff --git a/src/experimental/pgag.nim b/src/experimental/pgag.nim new file mode 100644 index 0000000..02c2b03 --- /dev/null +++ b/src/experimental/pgag.nim @@ -0,0 +1,785 @@ +import qex, gauge, physics/[qcdTypes,stagSolve] +import times, macros, algorithm +import hmc/metropolis +import observables/sources +import quda/qudaWrapper +import hmc/agradOps + +qexinit() + +let + lat = intSeqParam("lat", @[8,8,8,8]) + beta = floatParam("beta", 5.6) + tau0 = floatParam("tau", 0.04) + lam0 = floatParam("lam", 0.19) + sig0 = floatParam("sig", 0.3) + rho0 = floatParam("rho", 0.2) + theta0 = floatParam("theta", 0.2) + ups0 = floatParam("ups", 0.2) + upit = intParam("upit", 1) + gsteps0 = intParam("gsteps", 32) + nwarm = intParam("nwarm", 10) + trajs = intParam("trajs", 10) + seed0 = defaultComm.broadcast(int(1000*epochTime())) + seed = uint64 intParam("seed", seed0) + infn = stringParam("infn", "") + outfn = stringParam("outfn", "") +var + tau = tau0 + lam = lam0 + sig = sig0 + rho = rho0 + ups = ups0 + theta = theta0 + gsteps = gsteps0 + +macro echoparam(x: typed): untyped = + let n = x.repr + result = quote do: + echo `n`, ": ", `x` + +echoparam(beta) +echoparam(tau) +echoparam(lam) +echoparam(sig) +echoparam(rho) +echoparam(theta) +echoparam(ups) +echoparam(upit) +echoparam(gsteps) +echoparam(nwarm) +echoparam(trajs) +echoparam(seed) +echoparam(infn) +echoparam(outfn) + +let + gc = GaugeActionCoeffs(plaq: beta, adjplaq: 0) + lo = lat.newLayout + vol = lo.physVol + +var r = lo.newRNGField(RngMilc6, seed) +var R:RngMilc6 # global RNG +R.seed(seed, 987654321) + +var g = lo.newgauge +type + Gauge = typeof(g) + GaugeV = AgVar[Gauge] +template newGaugeV(c: AgTape, x: Gauge): auto = newGaugeFV(c, x) +case infn +of "": + g.random r + #g.unit +of "hot": + g.random r +of "cold": + g.unit +else: + echo "Loading gauge field from file: ", infn + let err = g.loadGauge infn + +echo "plaq: ", 6.0*g.plaq +echo "gaugeAction2: ", g.gaugeAction2 gc +echo "actionA: ", gc.actionA g + +var + p = lo.newgauge + f = lo.newgauge + g0 = lo.newgauge + +proc norm2*(x: seq): float = + for i in 0.. m.hOld: + costg = m.pAccept * (costg - vtau.obj * params[i].grad) + result.add costg + +proc checkGrad(m: Met) = + #let tg = vtau.grad + #let f0 = vtau.obj * exp(m.hOld-m.hNew) + let eps = 1e-7 + var gs = newSeq[float](0) + for i in 0.. 0: + echo "Starting warmups" + #gsteps = 60 + #fsteps = 40 + #setupMDx() + alwaysAccept = true + for n in 1..nwarm: + m.update + m.clearStats + +echo "Starting HMC" +#gsteps = gsteps0 +#fsteps = fsteps0 +#setupMD5() +alwaysAccept = false +gutime = 0.0 +gftime = 0.0 +#fftime = 0.0 +block: + tic() + for n in 1..trajs: + echo "Starting trajectory: ", n + tic() + m.update + getGrad(m) + if upit > 0: + if n mod upit == 0: + updateParams(0.001) + let tup = getElapsedTime() + measure() + let ttot = getElapsedTime() + echo "End trajectory update: ", tup, " measure: ", ttot-tup, " total: ", ttot + let et = getElapsedTime() + toc() + echo "HMC time: ", et + #let at = gutime + gftime + fftime + #echo &"gu: {gutime} gf: {gftime} ff: {fftime} ot: {et-at} tt: {et}" + +if outfn != "": + echo "Saving gauge field to file: ", outfn + let err = g.saveGauge outfn + +#echoTimers() +qexfinalize() diff --git a/src/gauge/hypsmear2.nim b/src/gauge/hypsmear2.nim new file mode 100644 index 0000000..2dda26d --- /dev/null +++ b/src/gauge/hypsmear2.nim @@ -0,0 +1,544 @@ +import base, layout, gauge, hypsmear, comms/halo, bitops +getOptimPragmas() + +#nflop = 61632.0 +# l1[mu,nu] : g[mu](0,nu,-nu) g[nu](0,mu)(0,-nu) +# l2[mu,nu] : g[mu](0) l1[mu,b](a,-a) l1[a,b](0,mu)(0,-a) +# fl[mu] : g[mu](0) l2[mu,nu](nu,-nu) l2[nu,mu](0,mu)(0,-nu) +# l2[mu,nu] : g[mu](0)+(0,b,-b)(a,-a) +# g[a](0,mu)(0,-a)(0,b,-b) +# g[b](0,mu)(0,-b)(0,a,-a) +# g[0]: (0,+-1,+-1,+-1) (-1,1,+-1,+-1) (-1,+-1,1,+-1) (-1,+-1,+-1,1) + +type + HypTemps*[L,F,T] = object + gf: array[4,F] + hl: HaloLayout[L] + hm: HaloMap[L] + l1x: FieldArray[L.V,T] + l2x: FieldArray[L.V,T] + l1: FieldArray[L.V,T] + l2: FieldArray[L.V,T] + flx: FieldArray[L.V,T] + hgf: array[4,Halo[L,F,T]] + h1x: array[4,array[4,Halo[L,F,T]]] + h2x: array[4,array[4,Halo[L,F,T]]] + h1: array[4,array[4,Halo[L,F,T]]] + h2: array[4,array[4,Halo[L,F,T]]] + info: PerfInfo + +template proj(x,y: auto) = x.projectU y +proc proj[T](x: T): T {.noInit,alwaysInline.} = result.proj x +template projDeriv(r: auto, x: auto, c: auto) = r.projectUDeriv(x, c) +template projDeriv(r: auto, u, x: auto, c: auto) = r.projectUDeriv(u, x, c) + +proc init*[L,F,T,G](ht: var HypTemps[L,F,T], gf: G, comm: Comm) = + tic "HypTemps init" + static: doAssert(type(gf[0]) is F) + static: doAssert(type(gf[0].l) is L) + static: doAssert(type(gf[0][0]) is T) + let lo = gf[0].l + doAssert(lo.nDim == 4) + ht.hl = lo.makeHaloLayout([1,1,1,1],[1,1,1,1]) + toc "makeHaloLayout" + ht.l1x = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.l2x = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.l1 = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.l2 = newFieldArray2(lo,F,[4,4],mu!=nu) + ht.flx = newFieldArray(lo,F,4) + for mu in 0..<4: + ht.gf[mu] = gf[mu] + ht.hgf[mu] = makeHalo(ht.hl, gf[mu]) + for nu in 0..<4: + if nu == mu: continue + ht.h1x[mu][nu] = makeHalo(ht.hl, ht.l1x[mu,nu]) + ht.h2x[mu][nu] = makeHalo(ht.hl, ht.l2x[mu,nu]) + ht.h1[mu][nu] = makeHalo(ht.hl, ht.l1[mu,nu]) + ht.h2[mu][nu] = makeHalo(ht.hl, ht.l2[mu,nu]) + toc "makeHalo" + const hmtype = 0 + when hmtype == 0: + var offsets = newSeq[array[4,int32]](0) + for k in 0..15: # use all corners for simplicity + var t = [1'i32,1,1,1] + for i in 0..<4: + if k.testBit i: t[i] = -1 + offsets.add t + ht.hm = ht.hl.makeHaloMap(comm, offsets) + elif hmtype == 1: + var offsets = newSeq[array[4,int32]](0) + for k in 0..15: + var t = [1'i32,1,1,1] + for i in 0..<4: + if k.testBit i: t[i] = -1 + if k != 0 and k != 15: offsets.add t + for mu in 0..<4: + if t[mu] == 1: + t[mu] = 0 + offsets.add t + t[mu] = 1 + ht.hm = hl.makeHaloMap(comm, offsets) + toc "makeHaloMap" +proc newHypTemps*[G](gf: G): auto = + type + L = type gf[0].l + F = type gf[0] + T = type gf[0][0] + var ht: HypTemps[L,F,T] + let comm = getDefaultComm() + ht.init(gf, comm) + ht + +proc symstaple(s: var auto, a: float, nu0,mu0,nu1,nu2,mu1,nu3: auto) = + let t0 = mu0 * nu1.adj + let t1 = nu0 * t0 + s += a*t1 + let t2 = mu1 * nu3 + let t3 = nu2.adj * t2 + s += a*t3 + +proc symstaple(s: var auto, a: float, x,y: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger, + p: static bool = false) = + when p: + template prj(m: auto): auto = proj(m) + else: + template prj(m: auto): auto = m + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + let xfmubnu = prj x[fmubnu] + var t = xi * yfnu * xfmu.adj + t += xbnu.adj * ybnu * xfmubnu + s += a*t +template symstaplep(s: var auto, a: float, x,y: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger) = + symstaple(s, a, x,y, i,fnu,fmu,bnu,fmubnu, true) + +proc symderiv(s: var auto, x,y,cx,cy: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger, + p: static bool = false): int = + mixin projectUflops + when p: + template prj(m: auto): auto = proj(m) + else: + template prj(m: auto): auto = m + if fnu>=0 and fmu>=0: + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + s += cx[i] * yfnu * xfmu.adj + s += xi * cy[fnu] * xfmu.adj + s += xi * yfnu * cx[fmu].adj + result += 3*projectUflops(3) + 3*(18+2*3*66) + if bnu>=0 and fmubnu>=0: + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + let xfmubnu = prj x[fmubnu] + s += cx[bnu].adj * ybnu * xfmubnu + s += xbnu.adj * cy[bnu] * xfmubnu + s += xbnu.adj * ybnu * cx[fmubnu] + result += 3*projectUflops(3) + 3*(18+2*3*66) +template symderivp(s: var auto, x,y,cx,cy: auto, i,fnu,fmu,bnu,fmubnu: SomeInteger): int = + symderiv(s, x,y,cx,cy, i,fnu,fmu,bnu,fmubnu, true) + +proc symderiv3(s: var auto, x,y,cx,cy: auto, i,fnu,fmu,bnu,fmubnu,nOut: SomeInteger, + p: static bool = false): int = + mixin projectUflops + when p: + template prj(m: auto): auto = proj(m) + else: + template prj(m: auto): auto = m + if fnu>=0 and fmu>=0: + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + result += 3*projectUflops(3) + if i=0 and fmubnu>=0: + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + result += 2*projectUflops(3) + if bnu=0 and fmu>=0: + let xi = prj x[i] + let yfnu = prj y[fnu] + let xfmu = prj x[fmu] + s += cx[i] * yfnu * xfmu.adj + s += xi * cy[fnu] * xfmu.adj + s += xi * yfnu * cx[fmu].adj + inc result, 3 + if bnu>=0 and fmubnu>=0: + let xbnu = prj x[bnu] + let ybnu = prj y[bnu] + let xfmubnu = prj x[fmubnu] + s += cx[bnu].adj * ybnu * xfmubnu + s += xbnu.adj * cy[bnu] * xfmubnu + s += xbnu.adj * ybnu * cx[fmubnu] + inc result, 3 + +#[ +# x: side links, y: middle links +# snu: c xmu ynu' +# snu: y xmu cnu' +# snu: c' x-mu ynu-mu +# snu: y' x-mu cnu-mu +# smu: x cnu xmu' +# smu: x-nu' c-nu xmu-nu +# x, xmu, x-mu, x-nu, xmu-nu; y, ynu, ynu-mu; c, cnu, c-nu +proc symderiv(f: var auto, a: float, nu0,mu0,nu1,nu2,mu1,nu3,c,cnu: auto) = + let t0 = mu0 * nu1.adj + let t1 = nu0 * t0 + f += a*t1 + let t2 = mu1 * nu3 + let t3 = nu2.adj * t2 + f += a*t3 +]# + +proc smear*[L,F,T,G](ht: HypTemps[L,F,T], coef: HypCoefs, fl: G) = + tic "HypTemps smear" + static: doAssert(type(fl[0]) is F) + static: doAssert(type(fl[0].l) is L) + static: doAssert(type(fl[0][0]) is T) + let lo = fl[0].l + doAssert(lo.nDim == 4) + let comm = getDefaultComm() + for mu in 0..<4: + ht.hgf[mu].halo := 0 + ht.hgf[mu].update ht.hm, comm + toc "update" + const V = L.V + var + gf = ht.gf + hl = ht.hl + l1x = ht.l1x + l2x = ht.l2x + flx = ht.flx + hgf = ht.hgf + h1x = ht.h1x + h2x = ht.h2x + h1 = ht.h1 + h2 = ht.h2 + let + alp1 = coef.alpha1 / 2.0 + alp2 = coef.alpha2 / 4.0 + alp3 = coef.alpha3 / 6.0 + ma1 = 1 - coef.alpha1 + ma2 = 1 - coef.alpha2 + ma3 = 1 - coef.alpha3 + let nhalo = hl.nExt + threads: + let nc = gf[0][0].nrows + let staplesFlops = float((4*(6*nc+2*(nc-1))+4*2)*nc*nc*V) + let siteFlops1 = staplesFlops + float(2*nc*nc*V) + let siteFlops2 = 2*staplesFlops+float((2*nc*nc+12*projectUflops(nc))*V) + let siteFlops3 = 3*staplesFlops+float((2*nc*nc+19*projectUflops(nc))*V) + var flops = 0.0 + # h1x[mu][nu] mu,nu,-nu,mu-nu + tfor i, 0.. 0: + hl2[mu][nu][i].projDeriv(h2x[mu][nu][i], hl2[mu][nu][i]) + hf[mu][i] += ma2 * hl2[mu][nu][i] + hl2[mu][nu][i] *= alp2 + flops += V*(flp+54+2*projectUflops(nc)) # guess for projDeriv + flops.threadSum + toc "3", flops=flops + flops = 0 + tfor i, 0.. 0: + hl1[mu][nu][i].projDeriv(h1x[mu][nu][i], hl1[mu][nu][i]) + hf[mu][i] += ma1 * hl1[mu][nu][i] + hl1[mu][nu][i] *= alp1 + #t.projDeriv(h1x[mu][nu][i], t) + #hf[mu][i] += ma1 * t + #hl1[mu][nu][i] = alp1 * t + flops += V*(flp+54+2*projectUflops(nc)) # guess for projDeriv + flops.threadSum + toc "2", flops=flops + flops = 0 + tfor i, 0.. Date: Thu, 14 Nov 2024 17:43:01 -0600 Subject: [PATCH 06/23] fix Nim warnings --- src/base/basicOps.nim | 1 + src/base/profile.nim | 2 +- src/examples/heatbath2dclockt.nim | 10 +++++----- src/examples/heatbath2dclocktb.nim | 12 ++++++------ src/examples/heatbath2dxy.nim | 12 ++++++------ src/examples/heatbath2dxyp.nim | 18 +++++++++--------- src/examples/heatbath2dxypt.nim | 2 +- src/examples/heatbath2dxyptb.nim | 16 ++++++++-------- src/examples/heatbath2dxyt.nim | 10 +++++----- src/examples/heatbath2dxytb.nim | 16 ++++++++-------- src/examples/modeigs1.nim | 2 +- src/examples/staghmc.nim | 8 ++++---- src/examples/staghmc_h.nim | 8 ++++---- src/examples/staghmc_s.nim | 6 +++--- src/io/modfile.nim | 10 +++++----- src/physics/wilsonD.nim | 20 ++++++++++---------- 16 files changed, 77 insertions(+), 76 deletions(-) diff --git a/src/base/basicOps.nim b/src/base/basicOps.nim index af0fc96..5498451 100644 --- a/src/base/basicOps.nim +++ b/src/base/basicOps.nim @@ -256,6 +256,7 @@ template setUnopT*(op,fun,t1,t2: untyped) {.dirty.} = rSetUnopT template setBinopP*(op,fun,t1,t2,t3: untyped) {.dirty.} = + getOptimPragmas() #template op*(x: typedesc[t1]; y: typedesc[t2]): typedesc = t3 proc op*(x: t1; y: t2): auto {.alwaysInline,noInit.} = var r{.noInit.}: t3 diff --git a/src/base/profile.nim b/src/base/profile.nim index 7197cbf..f577bec 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -822,7 +822,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] = t.children.add ri.children[i] do: # loc not found hs[loc] = ri - let tot = makeHotSpotTable(List[RTInfoObj](ri.children)) + #let tot = makeHotSpotTable(List[RTInfoObj](ri.children)) return (nstot, ohtot) proc echoHotspots* = diff --git a/src/examples/heatbath2dclockt.nim b/src/examples/heatbath2dclockt.nim index ed7d8cb..ba7997f 100644 --- a/src/examples/heatbath2dclockt.nim +++ b/src/examples/heatbath2dclockt.nim @@ -2,7 +2,7 @@ import qex import gauge, physics/qcdTypes import os, strutils, times -proc sumEnergy(fr,fi:any, J, h:any, g:any, p:seq[float], sf,sb:any) = +proc sumEnergy(fr,fi:auto, J, h:auto, g:auto, p:seq[float], sf,sb:auto) = fr := 0 fi := 0 for nu in 0..`(x) template TXT(x: untyped): untyped = xmltree.newText(x) diff --git a/src/examples/staghmc.nim b/src/examples/staghmc.nim index dc1746e..d040e46 100644 --- a/src/examples/staghmc.nim +++ b/src/examples/staghmc.nim @@ -134,9 +134,9 @@ proc mdvf(t: float) = p[mu] -= s*f[mu] toc("mdvf") -proc mdvf2(t: float) = - mdv(t) - mdvf(t) +#proc mdvf2(t: float) = +# mdv(t) +# mdvf(t) # For force gradient update #const useFG = true @@ -175,7 +175,7 @@ proc mdvAll(t: openarray[float]) = # For now, just do it separately. if t[0] != 0: mdv t[0] if t[1] != 0: mdvf t[1] -proc mdvAllfga(ts,gs:openarray[float]) = +proc mdvAllfga(ts,gs:openarray[float]) {.used.} = # TODO: actually share computation. # For now, just do it separately. let diff --git a/src/examples/staghmc_h.nim b/src/examples/staghmc_h.nim index 2643c1a..8af3801 100644 --- a/src/examples/staghmc_h.nim +++ b/src/examples/staghmc_h.nim @@ -80,7 +80,7 @@ template pnorm2(p2:float) = threadMaster: p2 = p2t g.rephase -proc gaction(g:any, f2:seq[float], p2:float):auto = +proc gaction(g:auto, f2:seq[float], p2:float):auto = let ga = gc.actionA g fa = f2.mapit(0.5*it) @@ -100,14 +100,14 @@ template faction(fa:seq[float]) = threadMaster: fa[i] = psi2 g.rephase -proc olf(f: var any, v1: any, v2: any) = +proc olf(f: var auto, v1: auto, v2: auto) = var t {.noInit.}: type(f) for i in 0.. Date: Thu, 14 Nov 2024 21:24:12 -0600 Subject: [PATCH 07/23] merge some Nim warning fixes from wrap branch --- configure | 3 ++- src/examples/testlapl1.nim | 2 +- src/io/crc32.nim | 2 +- src/io/readerQiolite.nim | 2 +- src/io/writerQiolite.nim | 4 ++-- src/physics/wilsonD.nim | 2 +- src/physics/wilsonSolve.nim | 4 ++-- 7 files changed, 10 insertions(+), 9 deletions(-) diff --git a/configure b/configure index 4c78216..d4616f8 100755 --- a/configure +++ b/configure @@ -15,5 +15,6 @@ if [ "X$NIM" = "X" ]; then fi echo "Using Nim compiler: $NIM" -echo "Runing: $NIM $dir/build/configure.nims ${@@Q}" +echo "Runing: $NIM $dir/build/configure.nims $@" +#echo "Runing: $NIM $dir/build/configure.nims ${@@Q}" $NIM $dir/build/configure.nims "$@" diff --git a/src/examples/testlapl1.nim b/src/examples/testlapl1.nim index 925751b..d890ca7 100644 --- a/src/examples/testlapl1.nim +++ b/src/examples/testlapl1.nim @@ -5,7 +5,7 @@ import io/timesliceIo import physics/wilsonD import physics/wilsonSolve import contract -import xmlparser, xmltree, strutils, sequtils, endians, times +import xmlparser, xmltree, strutils, sequtils, times template `&&`(x: int32): untyped = cast[pointer](unsafeAddr(x)) template `&&`(x: Field): untyped = cast[pointer](unsafeAddr(x[0])) diff --git a/src/io/crc32.nim b/src/io/crc32.nim index e06c6a4..f994f56 100644 --- a/src/io/crc32.nim +++ b/src/io/crc32.nim @@ -1,4 +1,4 @@ -import strutils +#import strutils type Crc32* = uint32 const InitCrc32* = Crc32(not 0'u32) diff --git a/src/io/readerQiolite.nim b/src/io/readerQiolite.nim index 0a72248..d4d421d 100644 --- a/src/io/readerQiolite.nim +++ b/src/io/readerQiolite.nim @@ -111,7 +111,7 @@ proc putP[T](buf: cstring; index: int, count: int; arg: openArray[ptr T]) = template `&`[T](x: seq[T]): untyped = cast[ptr T](unsafeaddr x[0]) template `+`(x: ptr char, i: SomeInteger): untyped = - cast[ptr char](cast[ByteAddress](x) + ByteAddress(i)) + cast[ptr char](cast[uint](x) + uint(i)) proc read[T](rd: var Reader, v: openArray[ptr T]) = let t0 = getMonoTime() diff --git a/src/io/writerQiolite.nim b/src/io/writerQiolite.nim index 28cd795..db52f80 100644 --- a/src/io/writerQiolite.nim +++ b/src/io/writerQiolite.nim @@ -1,6 +1,6 @@ import base import layout -import strutils, strformat +import strformat import field import os import scidacio @@ -87,7 +87,7 @@ proc getP[T](buf: cstring; index: int; count: int; arg: openArray[ptr T]) = template `&`[T](x: seq[T]): untyped = cast[ptr T](unsafeaddr x[0]) template `+`(x: ptr char, i: SomeInteger): untyped = - cast[ptr char](cast[ByteAddress](x) + ByteAddress(i)) + cast[ptr char](cast[uint](x) + uint(i)) proc write[T](wr: var Writer, v: var openArray[ptr T], lat: openArray[int], md="", ps="") = diff --git a/src/physics/wilsonD.nim b/src/physics/wilsonD.nim index d5890f7..a6ecca1 100644 --- a/src/physics/wilsonD.nim +++ b/src/physics/wilsonD.nim @@ -11,7 +11,7 @@ setForceInline(false) import os import times -import strUtils +#import strutils import base import layout diff --git a/src/physics/wilsonSolve.nim b/src/physics/wilsonSolve.nim index cae6fbd..1e2ba13 100644 --- a/src/physics/wilsonSolve.nim +++ b/src/physics/wilsonSolve.nim @@ -2,7 +2,7 @@ import physics/wilsonD import os import times -import strUtils +#import strutils import base import layout import field @@ -11,7 +11,7 @@ import qcdTypes import solvers/cg export cg import solvers/bicgstab -import solvers/solverUtils +#import solvers/solverUtils #import types #import profile #import metaUtils From efcb502d998ece5d0f127ff2e8d04dfed9c21390 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 15 Nov 2024 09:57:05 -0600 Subject: [PATCH 08/23] prepare for update to Color --- src/physics/color.nim | 292 +-------------------------------------- src/physics/colorOld.nim | 289 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 293 insertions(+), 288 deletions(-) create mode 100644 src/physics/colorOld.nim diff --git a/src/physics/color.nim b/src/physics/color.nim index 91dc40d..5d697c4 100644 --- a/src/physics/color.nim +++ b/src/physics/color.nim @@ -1,289 +1,5 @@ -import base/basicOps -import base/wrapperTypes -export wrapperTypes -import maths/types -import maths -import simd/simdWrap +import colorOld +export colorOld -makeWrapperType(Color): - ## wrapper type for colored objects - -type - Color2*[T] = Color[T] - Color3*[T] = Color[T] - Color4*[T] = Color[T] - -template asVarWrapper*(x: Color, y: typed): untyped = - #static: echo "asVarWrapper Color" - #var cy = asColor(y) - #cy - asVar(asColor(y)) - -template index*[T,I](x: typedesc[Color[T]], i: typedesc[I]): typedesc = - when I is Color: - index(T.type, I.type[]) - elif I.isWrapper: - Color[index(T.type, I.type)] - else: - index(T.type, I.type) - -template `[]`*[T](x: Color, i: T): untyped = - when T is Color: - x[][i[]] - elif T.isWrapper: - #indexed(x, i) - var tColorBracket = asColor(x[][i]) - tColorBracket - else: - x[][i] -template `[]`*(x: Color, i,j: SomeInteger): auto = x[][i,j] - -#[ -template `[]=`*[T](x: Color, i:T, y: typed): untyped = - when T.isWrapper and T isnot Color: - x[][asScalar(i)] = y - else: - x[][i] = y -]# -template `[]=`*[T](x: Color, i: T; y: auto) = - when T is Color2: - x[][i[]] = y - elif T.isWrapper: - #indexed(x, i) - var tColorBracket = asColor(x[][i]) - tColorBracket := y - else: - x[][i] = y -template `[]=`*(x: Color, i,j: SomeInteger, y: auto) = - x[][i,j] = y - -# forward from value to value -template forwardVV(f: untyped) {.dirty.} = - template f*(x: Color): auto = - mixin f - f(x[]) -# forward from value to type -#template forwardVT(f: untyped) {.dirty.} = -# template f*[T](x: Color[T]): untyped = -# mixin f -# f(type T) -# forward from type to type -template forwardTT(f: untyped) {.dirty.} = - template f*[T](x: typedesc[Color[T]]): auto = - mixin f - f(type T) -# forward from type to type and wrap -template forwardTTW(f: untyped) {.dirty.} = - template f*[T](x: typedesc[Color[T]]): auto = - mixin f - Color[f(type T)] - -forwardVV(len) -forwardVV(nrows) -forwardVV(ncols) -forwardVV(nVectors) -forwardVV(simdType) -#forwardVV(simdLength) -forwardVV(getNs) -forwardVV(numNumbers) - -forwardTT(len) -forwardTT(nrows) -forwardTT(ncols) -forwardTT(nVectors) -forwardTT(simdType) -forwardTT(simdLength) -forwardTT(getNs) -forwardTT(numberType) - -forwardTTW(toSingle) -forwardTTW(toDouble) - -#template eval*[T](x: typedesc[Color[T]]): typedesc = asColor(eval(type T)) -template eval*[T:Color](x: typedesc[T]): typedesc = asColor(eval((type T)[])) - -template has*[T:Color](x: typedesc[T], y: typedesc): bool = - mixin has - when y is Color: true - else: has(T.type[], y) - -template row*(x: Color, i: untyped): untyped = - mixin row - asColor(row(x[],i)) -template setRow*(r: Color; x: Color2; i: untyped): untyped = - setRow(r[], x[], i) - -template getNc*[T](x: Color[T]): untyped = - when T is Mat1: - x[].nrows - elif T is Vec1: - x[].len - else: - static: - echo "error: unknown Nc" - echo x.repr - echo type(x).name - qexExit 1 - 0 - -template binDDRet(fn,wr,T1,T2) = - template fn*(x: T1, y: T2): untyped = - wr(fn(x[], y[])) - #template fn*(x: T1, y: T2): auto = - # var tmp {.noInit.}: wr(evalType(fn(x[],y[])) - # wr(fn(x[], y[])) - -#binDDRet(`+`, asColor, Color, Color2) -binDDRet(`-`, asColor, Color, Color2) -#binDDRet(`*`, asColor, Color, Color2) -binDDRet(`/`, asColor, Color, Color2) - -setBinop(`+`, add, Color, Color2, asColor(evalType(x[]+y[]))) -setBinop(`*`, mul, Color, Color2, asColor(evalType(x[]*y[]))) - -template load1*(x: Color): untyped = asColor(load1(x[])) -template `-`*(x: Color): untyped = asColor(-(x[])) -template assign*(r: var Color, x: SomeNumber) = - assign(r[], x) -template assign*(r: var Color, x: AsComplex) = - assign(r[], x) -template assign*(r: var Color, x: Color2) = - assign(r[], x[]) -template `:=`*(r: var Color, x: SomeNumber) = - r[] := x -template `:=`*(r: var Color, x: AsComplex) = - r[] := x -template `:=`*(r: var Color, x: Color2) = - r[] := x[] -template `+=`*(r: var Color, x: Color2) = - r[] += x[] -template `-=`*(r: var Color, x: Color2) = - r[] -= x[] -template `*=`*(r: var Color, x: SomeNumber) = - r[] *= x -template iadd*(r: var Color, x: SomeNumber) = - iadd(r[], x) -template iadd*(r: var Color, x: AsComplex) = - iadd(r[], x) -template iadd*(r: var Color, x: Color2) = - iadd(r[], x[]) -template isub*(r: var Color, x: SomeNumber) = - isub(r[], x) -template isub*(r: var Color, x: Color2) = - isub(r[], x[]) -template imul*(r: var Color, x: SomeNumber) = - imul(r[], x) -template imadd*(r: var Color, x: Color2, y: Color3) = - imadd(r[], x[], y[]) -template imadd*(r: var Color, x: AsComplex, y: Color3) = - imadd(r[], x, y[]) -template imsub*(r: var Color, x: Color2, y: Color3) = - imsub(r[], x[], y[]) -template peqOuter*(r: var Color, x: Color2, y: Color3) = - peqOuter(r[], x[], y[]) -template meqOuter*(r: var Color, x: Color2, y: Color3) = - meqOuter(r[], x[], y[]) -template `+`*(r: Color, x: SomeNumber): untyped = - asColor(r[] + x) -template `+`*(r: SomeNumber, x: Color): untyped = - asColor(r + x[]) -template `-`*(r: Color, x: SomeNumber): untyped = - asColor(r[] - x) -template `+`*(r: Color, x: AsComplex): untyped = - asColor(r[] + x) -template `-`*(r: Color, x: AsComplex): untyped = - asColor(r[] - x) -template add*(r: var Color, x: Color2, y: Color3) = - add(r[], x[], y[]) -template sub*(r: var Color, x: Color2, y: Color3) = - sub(r[], x[], y[]) -template `*`*(x: Color, y: SomeNumber): untyped = - asColor(x[] * y) -#template `*`*(x: SomeNumber, y: Color): auto = - #asColor(x * y[]) -template `*`*[X:SomeNumber,Y:Color](x: X, y: Y): auto = - #static: echo "SomeNumber * Color" - #var tmp {.noInit.}: asColor(type(X)*type(Y)[]) - var tmp {.noInit.}: asColor(evalType(x*y[])) - #static: echo $type(tmp) - mul(tmp[], x, y[]) - tmp -template `*`*(x: Simd, y: Color2): untyped = - asColor(x * y[]) -#template `*`*(x: AsReal, y: Color2): untyped = -# asColor(x * y[]) -template `*`*[X:AsReal,Y:Color](x: X, y: Y): auto = - #var tmp {.noInit.}: asColor(type(X)*type(Y)[]) - var tmp {.noInit.}: asColor(evalType(x*y[])) - mul(tmp[], x, y[]) - tmp -template `*`*(x: AsImag, y: Color2): untyped = - asColor(x * y[]) -template `*`*(x: AsComplex, y: Color2): untyped = - asColor(x * y[]) -template `*`*(x: Color, y: AsComplex): untyped = - asColor(x[] * y) -#template mul*(x: SomeNumber, y: Color2): untyped = -# asColor(`*`(x, y[])) -template mul*[X:SomeNumber,Y:Color](x: X, y: Y): auto = - #var tmp {.noInit.}: asColor(type(X)*type(Y)[]) - var tmp {.noInit.}: asColor(evalType(x*y[])) - mul(tmp[], x, y[]) - tmp -template mul*(x: Color, y: Color2): untyped = - asColor(`*`(x[], y[])) -template mul*(r: var Color, x: Color2, y: Color3) = - mul(r[], x[], y[]) -template mul*(r: var Color, x: SomeNumber, y: Color3) = - mul(r[], x, y[]) -template mul*(r: var Color, x: Color2, y: SomeNumber) = - mul(r[], x[], y) -template mul*(r: var Color, x: AsComplex, y: Color3) = - mul(r[], x, y[]) -template mul*(x: AsComplex, y: Color2): untyped = - asColor(mul(x, y[])) -template random*(x: var Color) = - gaussian(x[], r) -template gaussian*(x: Color, r: untyped) = - gaussian(x[], r) -template uniform*(x: var Color, r: var untyped) = - uniform(x[], r) -template z4*(x: var Color, r: var untyped) = - z4(x[], r) -template z2*(x: var Color, r: var untyped) = - z2(x[], r) -template u1*(x: var Color, r: var untyped) = - u1(x[], r) -template projectU*(r: var Color) = - projectU(r[]) -template projectU*(r: var Color, x: Color2) = - projectU(r[], x[]) -template projectUderiv*(r: var Color, u: Color2, x: Color3, chain: Color4) = - projectUderiv(r[], u[], x[], chain[]) -template projectUderiv*(r: var Color, x: Color3, chain: Color4) = - projectUderiv(r[], x[], chain[]) -template projectSU*(r: var Color) = - projectSU(r[]) -template projectSU*(r: var Color, x: Color2) = - projectSU(r[], x[]) -template projectTAH*(r: var Color) = - projectTAH(r[]) -template projectTAH*(r: var Color, x: Color2) = - projectTAH(r[], x[]) -template checkU*(x: Color):untyped = checkU(x[]) -template checkSU*(x: Color):untyped = checkSU(x[]) -template norm2*(x: Color): untyped = norm2(x[]) -template norm2*(r: var auto, x: Color): untyped = norm2(r, x[]) -template inorm2*(r: var auto, x: Color2) = inorm2(r, x[]) -template dot*(x: Color, y: Color2): untyped = - dot(x[], y[]) -template idot*(r: var auto, x: Color2, y: Color3) = idot(r, x[], y[]) -template redot*(x: Color, y: Color2): untyped = - redot(x[], y[]) -template trace*(x: Color): untyped = trace(x[]) -template simdSum*(x: Color): untyped = asColor(simdSum(x[])) -template re*(x: Color): untyped = asColor(re(x[])) -template im*(x: Color): untyped = asColor(im(x[])) -template exp*(x: Color): untyped = asColor(exp(x[])) -template expDeriv*(x: Color, c: Color2): untyped = asColor(expDeriv(x[], c[])) -template ln*(x: Color): untyped = asColor(ln(x[])) +#import colorTensor +#export colorTensor diff --git a/src/physics/colorOld.nim b/src/physics/colorOld.nim new file mode 100644 index 0000000..91dc40d --- /dev/null +++ b/src/physics/colorOld.nim @@ -0,0 +1,289 @@ +import base/basicOps +import base/wrapperTypes +export wrapperTypes +import maths/types +import maths +import simd/simdWrap + +makeWrapperType(Color): + ## wrapper type for colored objects + +type + Color2*[T] = Color[T] + Color3*[T] = Color[T] + Color4*[T] = Color[T] + +template asVarWrapper*(x: Color, y: typed): untyped = + #static: echo "asVarWrapper Color" + #var cy = asColor(y) + #cy + asVar(asColor(y)) + +template index*[T,I](x: typedesc[Color[T]], i: typedesc[I]): typedesc = + when I is Color: + index(T.type, I.type[]) + elif I.isWrapper: + Color[index(T.type, I.type)] + else: + index(T.type, I.type) + +template `[]`*[T](x: Color, i: T): untyped = + when T is Color: + x[][i[]] + elif T.isWrapper: + #indexed(x, i) + var tColorBracket = asColor(x[][i]) + tColorBracket + else: + x[][i] +template `[]`*(x: Color, i,j: SomeInteger): auto = x[][i,j] + +#[ +template `[]=`*[T](x: Color, i:T, y: typed): untyped = + when T.isWrapper and T isnot Color: + x[][asScalar(i)] = y + else: + x[][i] = y +]# +template `[]=`*[T](x: Color, i: T; y: auto) = + when T is Color2: + x[][i[]] = y + elif T.isWrapper: + #indexed(x, i) + var tColorBracket = asColor(x[][i]) + tColorBracket := y + else: + x[][i] = y +template `[]=`*(x: Color, i,j: SomeInteger, y: auto) = + x[][i,j] = y + +# forward from value to value +template forwardVV(f: untyped) {.dirty.} = + template f*(x: Color): auto = + mixin f + f(x[]) +# forward from value to type +#template forwardVT(f: untyped) {.dirty.} = +# template f*[T](x: Color[T]): untyped = +# mixin f +# f(type T) +# forward from type to type +template forwardTT(f: untyped) {.dirty.} = + template f*[T](x: typedesc[Color[T]]): auto = + mixin f + f(type T) +# forward from type to type and wrap +template forwardTTW(f: untyped) {.dirty.} = + template f*[T](x: typedesc[Color[T]]): auto = + mixin f + Color[f(type T)] + +forwardVV(len) +forwardVV(nrows) +forwardVV(ncols) +forwardVV(nVectors) +forwardVV(simdType) +#forwardVV(simdLength) +forwardVV(getNs) +forwardVV(numNumbers) + +forwardTT(len) +forwardTT(nrows) +forwardTT(ncols) +forwardTT(nVectors) +forwardTT(simdType) +forwardTT(simdLength) +forwardTT(getNs) +forwardTT(numberType) + +forwardTTW(toSingle) +forwardTTW(toDouble) + +#template eval*[T](x: typedesc[Color[T]]): typedesc = asColor(eval(type T)) +template eval*[T:Color](x: typedesc[T]): typedesc = asColor(eval((type T)[])) + +template has*[T:Color](x: typedesc[T], y: typedesc): bool = + mixin has + when y is Color: true + else: has(T.type[], y) + +template row*(x: Color, i: untyped): untyped = + mixin row + asColor(row(x[],i)) +template setRow*(r: Color; x: Color2; i: untyped): untyped = + setRow(r[], x[], i) + +template getNc*[T](x: Color[T]): untyped = + when T is Mat1: + x[].nrows + elif T is Vec1: + x[].len + else: + static: + echo "error: unknown Nc" + echo x.repr + echo type(x).name + qexExit 1 + 0 + +template binDDRet(fn,wr,T1,T2) = + template fn*(x: T1, y: T2): untyped = + wr(fn(x[], y[])) + #template fn*(x: T1, y: T2): auto = + # var tmp {.noInit.}: wr(evalType(fn(x[],y[])) + # wr(fn(x[], y[])) + +#binDDRet(`+`, asColor, Color, Color2) +binDDRet(`-`, asColor, Color, Color2) +#binDDRet(`*`, asColor, Color, Color2) +binDDRet(`/`, asColor, Color, Color2) + +setBinop(`+`, add, Color, Color2, asColor(evalType(x[]+y[]))) +setBinop(`*`, mul, Color, Color2, asColor(evalType(x[]*y[]))) + +template load1*(x: Color): untyped = asColor(load1(x[])) +template `-`*(x: Color): untyped = asColor(-(x[])) +template assign*(r: var Color, x: SomeNumber) = + assign(r[], x) +template assign*(r: var Color, x: AsComplex) = + assign(r[], x) +template assign*(r: var Color, x: Color2) = + assign(r[], x[]) +template `:=`*(r: var Color, x: SomeNumber) = + r[] := x +template `:=`*(r: var Color, x: AsComplex) = + r[] := x +template `:=`*(r: var Color, x: Color2) = + r[] := x[] +template `+=`*(r: var Color, x: Color2) = + r[] += x[] +template `-=`*(r: var Color, x: Color2) = + r[] -= x[] +template `*=`*(r: var Color, x: SomeNumber) = + r[] *= x +template iadd*(r: var Color, x: SomeNumber) = + iadd(r[], x) +template iadd*(r: var Color, x: AsComplex) = + iadd(r[], x) +template iadd*(r: var Color, x: Color2) = + iadd(r[], x[]) +template isub*(r: var Color, x: SomeNumber) = + isub(r[], x) +template isub*(r: var Color, x: Color2) = + isub(r[], x[]) +template imul*(r: var Color, x: SomeNumber) = + imul(r[], x) +template imadd*(r: var Color, x: Color2, y: Color3) = + imadd(r[], x[], y[]) +template imadd*(r: var Color, x: AsComplex, y: Color3) = + imadd(r[], x, y[]) +template imsub*(r: var Color, x: Color2, y: Color3) = + imsub(r[], x[], y[]) +template peqOuter*(r: var Color, x: Color2, y: Color3) = + peqOuter(r[], x[], y[]) +template meqOuter*(r: var Color, x: Color2, y: Color3) = + meqOuter(r[], x[], y[]) +template `+`*(r: Color, x: SomeNumber): untyped = + asColor(r[] + x) +template `+`*(r: SomeNumber, x: Color): untyped = + asColor(r + x[]) +template `-`*(r: Color, x: SomeNumber): untyped = + asColor(r[] - x) +template `+`*(r: Color, x: AsComplex): untyped = + asColor(r[] + x) +template `-`*(r: Color, x: AsComplex): untyped = + asColor(r[] - x) +template add*(r: var Color, x: Color2, y: Color3) = + add(r[], x[], y[]) +template sub*(r: var Color, x: Color2, y: Color3) = + sub(r[], x[], y[]) +template `*`*(x: Color, y: SomeNumber): untyped = + asColor(x[] * y) +#template `*`*(x: SomeNumber, y: Color): auto = + #asColor(x * y[]) +template `*`*[X:SomeNumber,Y:Color](x: X, y: Y): auto = + #static: echo "SomeNumber * Color" + #var tmp {.noInit.}: asColor(type(X)*type(Y)[]) + var tmp {.noInit.}: asColor(evalType(x*y[])) + #static: echo $type(tmp) + mul(tmp[], x, y[]) + tmp +template `*`*(x: Simd, y: Color2): untyped = + asColor(x * y[]) +#template `*`*(x: AsReal, y: Color2): untyped = +# asColor(x * y[]) +template `*`*[X:AsReal,Y:Color](x: X, y: Y): auto = + #var tmp {.noInit.}: asColor(type(X)*type(Y)[]) + var tmp {.noInit.}: asColor(evalType(x*y[])) + mul(tmp[], x, y[]) + tmp +template `*`*(x: AsImag, y: Color2): untyped = + asColor(x * y[]) +template `*`*(x: AsComplex, y: Color2): untyped = + asColor(x * y[]) +template `*`*(x: Color, y: AsComplex): untyped = + asColor(x[] * y) +#template mul*(x: SomeNumber, y: Color2): untyped = +# asColor(`*`(x, y[])) +template mul*[X:SomeNumber,Y:Color](x: X, y: Y): auto = + #var tmp {.noInit.}: asColor(type(X)*type(Y)[]) + var tmp {.noInit.}: asColor(evalType(x*y[])) + mul(tmp[], x, y[]) + tmp +template mul*(x: Color, y: Color2): untyped = + asColor(`*`(x[], y[])) +template mul*(r: var Color, x: Color2, y: Color3) = + mul(r[], x[], y[]) +template mul*(r: var Color, x: SomeNumber, y: Color3) = + mul(r[], x, y[]) +template mul*(r: var Color, x: Color2, y: SomeNumber) = + mul(r[], x[], y) +template mul*(r: var Color, x: AsComplex, y: Color3) = + mul(r[], x, y[]) +template mul*(x: AsComplex, y: Color2): untyped = + asColor(mul(x, y[])) +template random*(x: var Color) = + gaussian(x[], r) +template gaussian*(x: Color, r: untyped) = + gaussian(x[], r) +template uniform*(x: var Color, r: var untyped) = + uniform(x[], r) +template z4*(x: var Color, r: var untyped) = + z4(x[], r) +template z2*(x: var Color, r: var untyped) = + z2(x[], r) +template u1*(x: var Color, r: var untyped) = + u1(x[], r) +template projectU*(r: var Color) = + projectU(r[]) +template projectU*(r: var Color, x: Color2) = + projectU(r[], x[]) +template projectUderiv*(r: var Color, u: Color2, x: Color3, chain: Color4) = + projectUderiv(r[], u[], x[], chain[]) +template projectUderiv*(r: var Color, x: Color3, chain: Color4) = + projectUderiv(r[], x[], chain[]) +template projectSU*(r: var Color) = + projectSU(r[]) +template projectSU*(r: var Color, x: Color2) = + projectSU(r[], x[]) +template projectTAH*(r: var Color) = + projectTAH(r[]) +template projectTAH*(r: var Color, x: Color2) = + projectTAH(r[], x[]) +template checkU*(x: Color):untyped = checkU(x[]) +template checkSU*(x: Color):untyped = checkSU(x[]) +template norm2*(x: Color): untyped = norm2(x[]) +template norm2*(r: var auto, x: Color): untyped = norm2(r, x[]) +template inorm2*(r: var auto, x: Color2) = inorm2(r, x[]) +template dot*(x: Color, y: Color2): untyped = + dot(x[], y[]) +template idot*(r: var auto, x: Color2, y: Color3) = idot(r, x[], y[]) +template redot*(x: Color, y: Color2): untyped = + redot(x[], y[]) +template trace*(x: Color): untyped = trace(x[]) +template simdSum*(x: Color): untyped = asColor(simdSum(x[])) +template re*(x: Color): untyped = asColor(re(x[])) +template im*(x: Color): untyped = asColor(im(x[])) +template exp*(x: Color): untyped = asColor(exp(x[])) +template expDeriv*(x: Color, c: Color2): untyped = asColor(expDeriv(x[], c[])) +template ln*(x: Color): untyped = asColor(ln(x[])) From 17561618c465885cb2db348ee45fa4698c83372c Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 15 Nov 2024 09:58:32 -0600 Subject: [PATCH 09/23] prepare for update to Spin --- src/physics/spin.nim | 440 +--------------------------------------- src/physics/spinOld.nim | 437 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 441 insertions(+), 436 deletions(-) create mode 100644 src/physics/spinOld.nim diff --git a/src/physics/spin.nim b/src/physics/spin.nim index 4e73d40..2a07f1b 100644 --- a/src/physics/spin.nim +++ b/src/physics/spin.nim @@ -1,437 +1,5 @@ -import base/wrapperTypes -import maths/types -import maths +import spinOld +export spinOld -makeWrapperType(Spin): - ## wrapper type for spin objects - -type - #Spin*[T] = object - # v*: T - Spin2*[T] = Spin[T] - Spin3*[T] = Spin[T] - #SpinMatrix[I,J:static[int],T] = Spin[MatrixArray[I,J,T]] - -#template asSpin*(xx: typed): untyped = -# #staticTraceBegin: asSpin -# let x_asSpin = xx -# #staticTraceEnd: asSpin -# Spin[type(x_asSpin)](v: x_asSpin) - -#template isWrapper*(x: Spin): untyped = true -#template asWrapper*(x: Spin, y: typed): untyped = -# asSpin(y) -template asVarWrapper*(x: Spin, y: typed): untyped = - asVar(asSpin(y)) - -template index*[T,I](x: typedesc[Spin[T]], i: typedesc[I]): typedesc = - when I is Spin: - index(T.type, I.type[]) - elif I.isWrapper: - Spin[index(T.type, I.type)] - else: - index(T.type, I.type) - -template `[]`*[T](x: Spin, i: T): untyped = - when T is Spin: - x[][i[]] - elif T.isWrapper: - #indexed(x, i) - var tSpinBracket = asSpin(x[][i]) - tSpinBracket - else: - x[][i] -template `[]`*(x: Spin, i,j: typed): untyped = x[][i,j] -#template `[]=`*[T](x: Spin, i: T; y: typed) = -proc `[]=`*[T](x: Spin, i: T; y: auto) = - when T is Spin: - x[][i[]] = y - elif T.isWrapper: - #indexed(x, i) - var tSpinBracket = asSpin(x[][i]) - tSpinBracket := y - else: - x[][i] = y -template `[]=`*(x: Spin, i,j,y: typed) = - x[][i,j] = y - -# forward from type to type -template forwardTT(f: untyped) {.dirty.} = - template f*[T](x: typedesc[Spin[T]]): auto = - mixin f - f(type T) -# forward from type to type and wrap -template forwardTTW(f: untyped) {.dirty.} = - template f*[T](x: typedesc[Spin[T]]): auto = - mixin f - Spin[f(type T)] - -forwardFunc(Spin, len) -forwardFunc(Spin, nrows) -forwardFunc(Spin, ncols) -forwardFunc(Spin, getNc) -forwardFunc(Spin, numberType) -forwardFunc(Spin, numNumbers) -forwardFunc(Spin, nVectors) -forwardFunc(Spin, simdType) -forwardFunc(Spin, simdLength) - -forwardTT(len) -forwardTT(nrows) -forwardTT(ncols) -forwardTT(nVectors) -forwardTT(simdType) -forwardTT(simdLength) -forwardTT(getNc) -forwardTT(numberType) - -forwardTTW(toSingle) -forwardTTW(toDouble) - -template eval*[T](x: typedesc[Spin[T]]): typedesc = asSpin(eval(type T)) - -template has*[T](x: typedesc[Spin[T]], y: typedesc): bool = - mixin has - when y is Spin2: - true - else: - has(type T, y) - -template row*(x: Spin, i: typed): untyped = - mixin row - asSpin(row(x[],i)) -template setRow*(r: Spin; x: Spin2; i: int): untyped = - setRow(r[], x[], i) - -template getNs*[T](x: Spin[T]): auto = - when T is Mat1: - x[].nrows - elif T is Vec1: - x[].len - else: - static: - echo "error: unknown Nc" - echo x.repr - echo type(x).name - qexExit 1 - 0 - -template binDDRet(fn,wr,T1,T2) = - template fn*(x: T1, y: T2): untyped = - #echoUntyped: fn - wr(fn(x[], y[])) - -binDDRet(`+`, asSpin, Spin, Spin2) -binDDRet(`-`, asSpin, Spin, Spin2) -binDDRet(`*`, asSpin, Spin, Spin2) -binDDRet(`/`, asSpin, Spin, Spin2) - -#template numberType*[T](x: typedesc[Spin[T]]): untyped = numberType(type(T)) -#template numNumbers*[T](x: typedesc[Spin[T]]): untyped = numberType(T) -#template numNumbers*(x: Spin): untyped = numNumbers(x[]) -#template toSingle*[T](x: typedesc[Spin[T]]): untyped = Spin[toSingle(type(T))] -template load1*(x: Spin): untyped = asSpin(load1(x[])) -template assign*(r: var Spin, x: SomeNumber) = - assign(r[], x) -template assign*(r: var Spin, x: AsComplex) = - assign(r[], x) -template assign*(r: var Spin, x: Spin2) = - assign(r[], x[]) -template `:=`*(r: var Spin, x: SomeNumber) = - `:=`(r[], x) -template `:=`*(r: var Spin, x: Spin2) = - r[] := x[] -template `+=`*(r: var Spin, x: Spin2) = - staticTraceBegin: peqSpinSpin - r[] += x[] - staticTraceEnd: peqSpinSpin -template `-=`*(r: var Spin, x: Spin2) = - r[] -= x[] -template `*=`*(r: var Spin, x: SomeNumber) = - `*=`(r[], x) -template iadd*(r: var Spin, x: AsComplex) = - iadd(r[], x) -template iadd*(r: var Spin, x: Spin2) = - iadd(r[], x[]) -template isub*(r: var Spin, x: Spin2) = - isub(r[], x[]) -template imul*(r: var Spin, x: SomeNumber) = - imul(r[], x) -template imadd*(r: var Spin, x: Spin2, y: Spin3) = - imadd(r[], x[], y[]) -template imsub*(r: var Spin, x: Spin2, y: Spin3) = - imsub(r[], x[], y[]) -template `*`*(x: Spin, y: SomeNumber): untyped = - asSpin(x[] * y) -template `*`*(x: SomeNumber, y: Spin2): untyped = - asSpin(x * y[]) -template `*`*(x: AsComplex, y: Spin2): untyped = - asSpin(x * y[]) -template sub*(r: var Spin, x: Spin2, y: Spin3) = - sub(r[], x[], y[]) -template mul*(r: var Spin, x: Spin2, y: Spin3) = - mul(r[], x[], y[]) -template random*(x: var Spin) = - gaussian(x[], r) -template gaussian*(x: var Spin, r: var untyped) = - gaussian(x[], r) -template uniform*(x: var Spin, r: var untyped) = - uniform(x[], r) -template z4*(x: var Spin, r: var untyped) = - z4(x[], r) -template z2*(x: var Spin, r: var untyped) = - z2(x[], r) -template u1*(x: var Spin, r: var untyped) = - u1(x[], r) -template projectU*(r: var Spin, x: Spin2) = - projectU(r[], x[]) -template norm2*(x: Spin): untyped = norm2(x[]) -template inorm2*(r: var typed, x: Spin2) = inorm2(r, x[]) -template dot*(x: Spin, y: Spin2): untyped = - dot(x[], y[]) -template idot*(r: var typed, x: Spin2, y: Spin3) = idot(r, x[], y[]) -template redot*(x: Spin, y: Spin2): untyped = - redot(x[], y[]) -template trace*(x: Spin): untyped = trace(x[]) - -template spinVector*(x: static[int], a: untyped): untyped = - #const - # I:int = x - #type - # E = T - # VA = VectorArray[I,E] - # VAO = VectorArrayObj[I,E] - #Spin[VA](v: VA(v: a)) - #Spin[VA](v: VA(v: VAO(vec: a))) - #asSpin(VA(v: a)) - #static: echo "spinVector" - asSpin(asVectorArray(a)) - #let t1 = asVectorArray(a) - #static: echo "spinVector1" - #let t = asSpin(t1) - #static: echo "spinVector2" - #t -template spinMatrix*[T](x,y:static[int], a: untyped): untyped = - const - I:int = x - J:int = y - type - E = T - #MA = MatrixArray[I,J,E] - MAO = MatrixArrayObj[I,J,E] - asSpin(asMatrix(MAO(mat: a))) -#template spinMatrix*[I,J:static[int],T](a: untyped): untyped = -# Spin[MatrixArray[I,J,T]](v: MatrixArray[I,J,T](v: MatrixArrayObj[I,J,T](mat: a))) -#template spinMatrix*(I,J,T,a: untyped): untyped = -# Spin[MatrixArray[I,J,T]](v: MatrixArray[I,J,T](v: MatrixArrayObj[I,J,T](mat: a))) - -const z0 = newComplex( 0.0, 0.0) -const z1 = newComplex( 1.0, 0.0) -const zi = newComplex( 0.0, 1.0) -const n1 = newComplex(-1.0, 0.0) -const ni = newComplex( 0.0, -1.0) - -template s(r,c,x: untyped): untyped = - spinMatrix[ComplexType[float]](r,c,x) -template g(x: untyped): untyped = s(4,4,x) -template p(x: untyped): untyped = s(2,4,x) -template r(x: untyped): untyped = s(4,2,x) - -const - gamma0* = g([[ z1, z0, z0, z0 ], - [ z0, z1, z0, z0 ], - [ z0, z0, z1, z0 ], - [ z0, z0, z0, z1 ]]) - gamma1* = g([[ z0, z0, z0, zi ], - [ z0, z0, zi, z0 ], - [ z0, ni, z0, z0 ], - [ ni, z0, z0, z0 ]]) - gamma2* = g([[ z0, z0, z0, n1 ], - [ z0, z0, z1, z0 ], - [ z0, z1, z0, z0 ], - [ n1, z0, z0, z0 ]]) - gamma3* = g([[ z0, z0, zi, z0 ], - [ z0, z0, z0, ni ], - [ ni, z0, z0, z0 ], - [ z0, zi, z0, z0 ]]) - gamma4* = g([[ z0, z0, z1, z0 ], - [ z0, z0, z0, z1 ], - [ z1, z0, z0, z0 ], - [ z0, z1, z0, z0 ]]) - gamma5* = g([[ z1, z0, z0, z0 ], - [ z0, z1, z0, z0 ], - [ z0, z0, n1, z0 ], - [ z0, z0, z0, n1 ]]) - - spprojmat1p* = p([[ z1, z0, z0, zi ], - [ z0, z1, zi, z0 ]]) - spprojmat1m* = p([[ z1, z0, z0, ni ], - [ z0, z1, ni, z0 ]]) - spprojmat2p* = p([[ z1, z0, z0, n1 ], - [ z0, z1, z1, z0 ]]) - spprojmat2m* = p([[ z1, z0, z0, z1 ], - [ z0, z1, n1, z0 ]]) - spprojmat3p* = p([[ z1, z0, zi, z0 ], - [ z0, z1, z0, ni ]]) - spprojmat3m* = p([[ z1, z0, ni, z0 ], - [ z0, z1, z0, zi ]]) - spprojmat4p* = p([[ z1, z0, z1, z0 ], - [ z0, z1, z0, z1 ]]) - spprojmat4m* = p([[ z1, z0, n1, z0 ], - [ z0, z1, z0, n1 ]]) - - spreconmat1p* = r([[ z1, z0 ], - [ z0, z1 ], - [ z0, ni ], - [ ni, z0 ]]) - spreconmat1m* = r([[ z1, z0 ], - [ z0, z1 ], - [ z0, zi ], - [ zi, z0 ]]) - spreconmat2p* = r([[ z1, z0 ], - [ z0, z1 ], - [ z0, z1 ], - [ n1, z0 ]]) - spreconmat2m* = r([[ z1, z0 ], - [ z0, z1 ], - [ z0, n1 ], - [ z1, z0 ]]) - spreconmat3p* = r([[ z1, z0 ], - [ z0, z1 ], - [ ni, z0 ], - [ z0, zi ]]) - spreconmat3m* = r([[ z1, z0 ], - [ z0, z1 ], - [ zi, z0 ], - [ z0, ni ]]) - spreconmat4p* = r([[ z1, z0 ], - [ z0, z1 ], - [ z1, z0 ], - [ z0, z1 ]]) - spreconmat4m* = r([[ z1, z0 ], - [ z0, z1 ], - [ n1, z0 ], - [ z0, n1 ]]) - -template I(x: typed): untyped = - newImag(1)*x - -proc spproj1p*(r: var auto, x: auto) = - ## r: HalfFermion - ## x: DiracFermion - #let nc = x[0].len - #for i in 0.. Date: Fri, 15 Nov 2024 13:46:10 -0600 Subject: [PATCH 10/23] prepare for new simdArray --- src/simd/simdArray.nim | 553 +------------------------------------- src/simd/simdArrayOld.nim | 550 +++++++++++++++++++++++++++++++++++++ 2 files changed, 554 insertions(+), 549 deletions(-) create mode 100644 src/simd/simdArrayOld.nim diff --git a/src/simd/simdArray.nim b/src/simd/simdArray.nim index c2eb28a..b332519 100644 --- a/src/simd/simdArray.nim +++ b/src/simd/simdArray.nim @@ -1,550 +1,5 @@ -import macros -import base -import base/metaUtils -import maths/types -export types -#import ../basicOps +import simdArrayOld +export simdArrayOld -#getOptimPragmas() -#{.pragma: alwaysInline, codegenDecl: "inline __attribute__((always_inline)) $# $#$#".} - -template `[]`*(x: SomeNumber): untyped = x -template `[]`*(x: SomeNumber, i: typed): untyped = x - -# map (var param) (param) (return) - -template map011(T,L,op1,op2:untyped):untyped {.dirty.} = - getOptimPragmas() - proc op1*(x:T):T {.alwaysInline,noInit.} = - bind forStatic - #forStatic i, 0, L-1: - for i in 0..P: - bind map110 - map110(T, L, F, F) - else: - proc F*(r:var T; x:T) {.inline.} = - const b = (P div N0) and (L-1) - bind forStatic - forStatic i, 0, L-1: - assign(r[][i], x[][i xor b]) -template makePerm(P,T,L,N0) {.dirty.} = - bind evalBacktic, makePermX - evalBacktic: - makePermX(`"perm" P`,P,T,L,N0) -template makePackPX(F,P,T,L,N0) {.dirty.} = - # P: perm, T: vector type, L: outer vec len, N0: inner vec len - when N0>P: - proc F*(r:var openArray[SomeNumber], x:T, - l:var openarray[SomeNumber]) {.inline.} = - const N02 = N0 div 2 - let ra = cast[ptr array[L,array[N02,type(r[0])]]](r[0].addr) - let la = cast[ptr array[L,array[N02,type(l[0])]]](l[0].addr) - bind forStatic - forStatic i, 0, L-1: - F(ra[][i], x[][i], la[][i]) - else: - proc F*(r:var openArray[SomeNumber], x:T, - l:var openarray[SomeNumber]) {.inline.} = - const L2 = L div 2 - let ra = cast[ptr array[L2,array[N0,type(r[0])]]](r[0].addr) - let la = cast[ptr array[L2,array[N0,type(l[0])]]](l[0].addr) - const b = (P div N0) and (L-1) - var ir,il = 0 - bind forStatic - forStatic i, 0, L-1: - if (i and b) == 0: - when N0==1: - la[][il][0] = x[][i] - else: - assign(la[][il], x[][i]) - inc il - else: - when N0==1: - ra[][ir][0] = x[][i] - else: - assign(ra[][ir], x[][i]) - inc ir -template makePackMX(F,P,T,L,N0) {.dirty.} = - # P: perm, T: vector type, L: outer vec len, N0: inner vec len - when N0>P: - proc F*(r:var openArray[SomeNumber], x:T, - l:var openarray[SomeNumber]) {.inline.} = - const N02 = N0 div 2 - let ra = cast[ptr array[L,array[N02,type(r[0])]]](r[0].addr) - let la = cast[ptr array[L,array[N02,type(l[0])]]](l[0].addr) - bind forStatic - forStatic i, 0, L-1: - F(ra[][i], x[][i], la[][i]) - else: - proc F*(r:var openArray[SomeNumber], x:T, - l:var openarray[SomeNumber]) {.inline.} = - const L2 = L div 2 - let ra = cast[ptr array[L2,array[N0,type(r[0])]]](r[0].addr) - let la = cast[ptr array[L2,array[N0,type(l[0])]]](l[0].addr) - const b = (P div N0) and (L-1) - var ir,il = 0 - bind forStatic - forStatic i, 0, L-1: - if (i and b) == 0: - when N0==1: - ra[][ir][0] = x[][i] - else: - assign(ra[][ir], x[][i]) - inc ir - else: - when N0==1: - la[][il][0] = x[][i] - else: - assign(la[][il], x[][i]) - inc il -template makePackP(P,T,L,N0) {.dirty.} = - bind evalBacktic, makePackPX - evalBacktic: - makePackPX(`"packp" P`,P,T,L,N0) -template makePackM(P,T,L,N0) {.dirty.} = - bind evalBacktic, makePackMX - evalBacktic: - makePackMX(`"packm" P`,P,T,L,N0) -template makeBlendPX(F,P,T,L,N0) {.dirty.} = - when N0>P: - proc F*(x:var T; r:openArray[SomeNumber]; - l:openArray[SomeNumber]) {.inline.} = - const N02 = N0 div 2 - let ra = cast[ptr array[L,array[N02,type(r[0])]]](unsafeAddr(r[0])) - let la = cast[ptr array[L,array[N02,type(l[0])]]](unsafeAddr(l[0])) - forStatic i, 0, L-1: - F(x[][i], ra[][i], la[][i]) - else: - proc F*(x:var T; r:openArray[SomeNumber]; - l:openArray[SomeNumber]) {.inline.} = - const L2 = L div 2 - let ra = cast[ptr array[L2,array[N0,type(r[0])]]](unsafeAddr(r[0])) - let la = cast[ptr array[L2,array[N0,type(l[0])]]](unsafeAddr(l[0])) - const b = (P div N0) and (L-1) - var ir,il = 0 - bind forStatic - forStatic i, 0, L-1: - if (i and b) == 0: - when N0==1: - x[][i] = la[][il][0] - else: - assign(x[][i], la[][il]) - inc il - else: - when N0==1: - x[][i] = ra[][ir][0] - else: - assign(x[][i], ra[][ir]) - inc ir -template makeBlendMX(F,P,T,L,N0) {.dirty.} = - when N0>P: - proc F*(x:var T; r:openArray[SomeNumber]; - l:openArray[SomeNumber]) {.inline.} = - const N02 = N0 div 2 - let ra = cast[ptr array[L,array[N02,type(r[0])]]](unsafeAddr(r[0])) - let la = cast[ptr array[L,array[N02,type(l[0])]]](unsafeAddr(l[0])) - bind forStatic - forStatic i, 0, L-1: - F(x[][i], ra[][i], la[][i]) - else: - proc F*(x:var T; r:openArray[SomeNumber]; - l:openArray[SomeNumber]) {.inline.} = - const L2 = L div 2 - let ra = cast[ptr array[L2,array[N0,type(r[0])]]](unsafeAddr(r[0])) - let la = cast[ptr array[L2,array[N0,type(l[0])]]](unsafeAddr(l[0])) - const b = (P div N0) and (L-1) - var ir,il = 0 - bind forStatic - forStatic i, 0, L-1: - if (i and b) == 0: - when N0==1: - x[][i] = ra[][ir][0] - else: - assign(x[][i], ra[][ir]) - inc ir - else: - when N0==1: - x[][i] = la[][il][0] - else: - assign(x[][i], la[][il]) - inc il -template makeBlendP(P,T,L,N0) {.dirty.} = - bind evalBacktic, makeBlendPX - evalBacktic: - makeBlendPX(`"blendp" P`,P,T,L,N0) -template makeBlendM(P,T,L,N0) {.dirty.} = - bind evalBacktic, makeBlendMX - evalBacktic: - makeBlendMX(`"blendm" P`,P,T,L,N0) - -discard """ - proc op*(x:var T; r:openArray[SomeNumber]; - l:openArray[SomeNumber]) {.inline.} = - let ra = cast[ptr array[2,type(r[0])]](unsafeAddr(r[2])) - let la = cast[ptr array[2,type(l[0])]](unsafeAddr(l[2])) - op(x[][0], r, l) - op(x[][1], ra[], la[]) -""" - -#type SimdArrayObj*[L,B] = object -# v*: array[L,B] - -template arrayType[T](N: typed, x: typedesc[T]): untyped = - type B {.gensym.} = T.type - #static: echo "arrayType: ", N, " ", B - array[N,B] -# T = Simd{S,D}{L} = array[L,B] -# B ~ array[N0,F] -template makeSimdArray*(T:untyped;L:typed;B:typedesc):untyped {.dirty.} = -#template makeSimdArray*(L:typed;B:typedesc;T:untyped) {.dirty.} = - #static: echo "makeSimdArray: ", L, " ", $B.type - #makeSimdArray2(T, L, type B, numberType(B), numNumbers(B), L*numNumbers(B)) - makeSimdArray2(L, B, numberType(B), numNumbers(B), L*numNumbers(B), T) -#template makeSimdArray2*(T:untyped;L,B,F,N0,N:typed):untyped {.dirty.} = -#template makeSimdArray2*(T:untyped;L:typed;BB,F:typedesc;N0,N:typed):untyped {.dirty.} = -template makeSimdArray2*(L:typed;B,F:typedesc;N0,N:typed,T:untyped) {.dirty.} = - getOptimPragmas() - #static: echo "makeSimdArray2: ", L, " ", $B - bind map011, map021, map021x, map110, map120, map120x, map130, arrayType - bind makePerm, makePackP, makePackM, makeBlendP, makeBlendM - #type B {.gensym.} = typeof(BB) - #type T* = distinct array[L,B] - type T* = object - #v*: array[L,B.type] - v*: arrayType(L,B) - #v* {.align(L*sizeof(B)).}: arrayType(L,B) - #type T* = SimdArrayObj[L,B] - template isWrapper*(x:typedesc[T]): bool = false - template isWrapper*(x:T): bool = false - template numberType*(x:typedesc[T]):typedesc = F - template numberType*(x:T):typedesc = F - template numNumbers*(x:typedesc[T]):untyped = N - template numNumbers*(x:T):untyped = N - template simdType*(x:typedesc[T]):typedesc = T - template simdType*(x:T):typedesc = T - template simdLength*(x:T):untyped = N - template simdLength*(x:typedesc[T]):untyped = N - template eval*(x:T): untyped = x - template eval*(x:typedesc[T]): typedesc = T - #template `[]`*(x:T):untyped = (array[L,B])(x) - template `[]`*(x:T):untyped = x.v - template `[]=`*(x: T; y: typed) = x.v = y - when B is SomeNumber: - template `[]`*(x:T; i:SomeInteger):untyped = x[][i div N0] - template `[]=`*(x:T; i:SomeInteger; y:auto) = x[][i div N0] := y - else: - template `[]`*(x:T; i:SomeInteger):untyped = x[][i div N0][i mod N0] - template `[]=`*(x:T; i:SomeInteger; y:auto) = x[][i div N0][i mod N0] = y - template load1*(x:T):untyped = x - proc to*(x:SomeNumber; y:typedesc[T]):T {.alwaysInline,noInit.} = - bind forStatic - forStatic i, 0, L-1: - assign(result[][i], x) - #echoAst: F - #static: echo F.treerepr - #when not(F is float32): - # should be handled in simd.nim now - #when F is float64: - # template toDoubleImpl*(x: T): untyped = x - #else: - # template toSingleImpl*(x: T): untyped = x - proc simdReduce*(r: var SomeNumber; x: T) {.inline.} = - #mixin add - var y = x[][0] - forStatic i, 1, L-1: - iadd(y, x[][i]) - r = (type(r))(simdReduce(y)) - proc simdReduce*(x:T):F {.noInit,inline.} = simdReduce(result, x) - template simdSum*(r:var SomeNumber; x:T) = simdReduce(r, x) - template simdSum*(x:T):untyped = simdReduce(x) - proc simdMaxReduce*(r:var SomeNumber; x:T) {.inline.} = - mixin simdMaxReduce - var y = x[][0] - forStatic i, 1, L-1: - let c = x[][i] - y = max(y, c) - r = (type(r))(simdMaxReduce(y)) - proc simdMaxReduce*(x:T):F {.noinit,inline.} = simdMaxReduce(result, x) - template simdMax*(r:var SomeNumber; x:T) = simdMaxReduce(r, x) - template simdMax*(x:T):untyped = simdMaxReduce(x) - proc simdMinReduce*(r:var SomeNumber; x:T) {.inline.} = - mixin simdMinReduce - var y = x[][0] - forStatic i, 1, L-1: - let c = x[][i] - y = min(y, c) - r = (type(r))(simdMinReduce(y)) - proc simdMinReduce*(x:T):F {.noinit,inline.} = simdMinReduce(result, x) - template simdMin*(r:var SomeNumber; x:T) = simdMinReduce(r, x) - template simdMin*(x:T):untyped = simdMinReduce(x) - proc `-`*(x:T):T {.inline,noInit.} = - forStatic i, 0, L-1: - neg(result[][i], x[][i]) - - map011(T, L, abs, abs) - map011(T, L, trace, trace) - map011(T, L, norm2, norm2) - map011(T, L, sqrt, sqrt) - map011(T, L, rsqrt, rsqrt) - map011(T, L, inv, inv) - map011(T, L, sin, sin) - map011(T, L, cos, cos) - map011(T, L, acos, acos) - - map021(T, L, atan2, atan2) - map021(T, L, min, min) - map021(T, L, max, max) - map021(T, L, add, `+`) - map021(T, L, sub, `-`) - map021(T, L, mul, `*`) - map021(T, L, divd, `/`) - map021(T, L, `+`, `+`) - map021(T, L, `-`, `-`) - map021(T, L, `*`, `*`) - map021(T, L, `/`, `/`) - #map021(T, L, `<`, `<`) - map021(T, L, copySign, copySign) - - map110(T, L, assign, assign) - map110(T, L, neg, neg) - map110(T, L, iadd, iadd) - map110(T, L, isub, isub) - map110(T, L, imul, imul) - map110(T, L, idiv, idiv) - map110(T, L, norm2, norm2) - map110(T, L, inorm2, inorm2) - map110(T, L, rsqrt, rsqrt) - map110(T, L, `+=`, iadd) - map110(T, L, `-=`, isub) - map110(T, L, `*=`, imul) - map110(T, L, `/=`, idiv) - - map120(T, L, add, add) - map120(T, L, sub, sub) - map120(T, L, mul, mul) - map120(T, L, divd, divd) - map120(T, L, imadd, imadd) - map120(T, L, imsub, imsub) - - map130(T, L, msub, msub) - - template `:=`*(r: T; x: T): untyped = assign(r, x) - template `:=`*(r: T; x: array[L,B]) = - let xx = x - forStatic i, 0, L-1: - r[][i] = xx[i] - when N==1: - template assign*(r: SomeNumber, x: T): untyped = r = x[0] - proc assign*(r: var T; x: SomeNumber) {.alwaysInline.} = - #assign(r, x.to(T)) - forStatic i, 0, L-1: - assign(r[][i], x) - template `:=`*(r: T, x: SomeNumber) = assign(r, x) - proc assign*(r:var T; x:array[N,SomeNumber]) {.inline.} = - when compiles(assign(r[][0], unsafeAddr(x[0]))): - forStatic i, 0, L-1: - assign(r[][i], unsafeAddr(x[i*N0])) - else: - var y = x - forStatic i, 0, L-1: - assign(r[][i], unsafeAddr(y[i*N0])) - proc assign*(r:var array[N,SomeNumber], x:T) {.inline.} = - bind forStatic - when B is SomeNumber: - forStatic i, 0, L-1: - r[i] = x[][i] - else: - forStatic i, 0, L-1: - assign(addr(r[i*N0]), x[][i]) - proc assign*(m: Masked[T], x: SomeNumber) = - #static: echo "a mask" - var i = 0 - var b = m.mask - while b != 0: - if (b and 1) != 0: - m.pobj[][i] = x - b = b shr 1 - i.inc - #static: echo "end a mask" - template add*(r:var T; x:SomeNumber; y:T) = add(r, x.to(type(T)), y) - template add*(r:var T; x:T; y:SomeNumber) = add(r, x, y.to(type(T))) - template sub*(r:var T; x:SomeNumber; y:T) = sub(r, x.to(type(T)), y) - template sub*(r:var T; x:T; y:SomeNumber) = sub(r, x, y.to(type(T))) - template mul*(r:var T; x:SomeNumber; y:T) = mul(r, x.to(type(T)), y) - #map120x(SomeNumber,T,T,L,mul,mul) - template mul*(r:var T; x:T; y:SomeNumber) = mul(r, x, y.to(type(T))) - template iadd*(r:var T; x:SomeNumber) = iadd(r, x.to(type(T))) - template isub*(r:var T; x:SomeNumber) = isub(r, x.to(type(T))) - template imadd*(r:var T; x:SomeNumber; y:T) = imadd(r, x.to(type(T)), y) - template imsub*(r:var T; x:SomeNumber; y:T) = imsub(r, x.to(type(T)), y) - template divd*(r:var T; x:SomeNumber; y:T) = divd(r, x.to(type(T)), y) - template imadd*(r:var T; x:T; y:SomeNumber) = imadd(r, x, y.to(type(T))) - template imsub*(r:var T; x:T; y:SomeNumber) = imsub(r, x, y.to(type(T))) - template divd*(r:var T; x:T; y:SomeNumber) = divd(r, x, y.to(type(T))) - template imul*(r:var T; x:SomeNumber) = imul(r, x.to(type(T))) - template idiv*(r:var T; x:SomeNumber) = idiv(r, x.to(type(T))) - template msub*(r:var T; x:SomeNumber; y,z:T) = msub(r, x.to(type(T)), y, z) - template `:=`*(r:var T; x:array[N,SomeNumber]) = assign(r, x) - template `+`*(x:SomeNumber; y:T):T = add(x.to(type(T)), y) - template `+`*(x:T; y:SomeNumber):T = add(x, y.to(type(T))) - template `-`*(x:SomeNumber; y:T):T = sub(x.to(type(T)), y) - template `-`*(x:T; y:SomeNumber):T = sub(x, y.to(type(T))) - template `*`*(x:SomeNumber; y:T):T = mul(x.to(type(T)), y) - #map021x(SomeNumber,T,T,L,`*`,`*`) - template `*`*(x:T; y:SomeNumber):T = mul(x, y.to(type(T))) - template `/`*(x:SomeNumber; y:T):T = divd(x.to(type(T)), y) - template `/`*(x:T; y:SomeNumber):T = divd(x, y.to(type(T))) - template `<`*(x:T; y:SomeNumber):T = `<`(x, y.to(type(T))) - template `+=`*(r:var T; x:SomeNumber) = iadd(r, x.to(type(T))) - template `-=`*(r:var T; x:SomeNumber) = isub(r, x.to(type(T))) - template `*=`*(r:var T; x:SomeNumber) = imul(r, x.to(type(T))) - template `/=`*(r:var T; x:SomeNumber) = idiv(r, x.to(type(T))) - - proc `$`*(x:T):string = - result = "[" & $x[0] - for i in 1..P: + bind map110 + map110(T, L, F, F) + else: + proc F*(r:var T; x:T) {.inline.} = + const b = (P div N0) and (L-1) + bind forStatic + forStatic i, 0, L-1: + assign(r[][i], x[][i xor b]) +template makePerm(P,T,L,N0) {.dirty.} = + bind evalBacktic, makePermX + evalBacktic: + makePermX(`"perm" P`,P,T,L,N0) +template makePackPX(F,P,T,L,N0) {.dirty.} = + # P: perm, T: vector type, L: outer vec len, N0: inner vec len + when N0>P: + proc F*(r:var openArray[SomeNumber], x:T, + l:var openarray[SomeNumber]) {.inline.} = + const N02 = N0 div 2 + let ra = cast[ptr array[L,array[N02,type(r[0])]]](r[0].addr) + let la = cast[ptr array[L,array[N02,type(l[0])]]](l[0].addr) + bind forStatic + forStatic i, 0, L-1: + F(ra[][i], x[][i], la[][i]) + else: + proc F*(r:var openArray[SomeNumber], x:T, + l:var openarray[SomeNumber]) {.inline.} = + const L2 = L div 2 + let ra = cast[ptr array[L2,array[N0,type(r[0])]]](r[0].addr) + let la = cast[ptr array[L2,array[N0,type(l[0])]]](l[0].addr) + const b = (P div N0) and (L-1) + var ir,il = 0 + bind forStatic + forStatic i, 0, L-1: + if (i and b) == 0: + when N0==1: + la[][il][0] = x[][i] + else: + assign(la[][il], x[][i]) + inc il + else: + when N0==1: + ra[][ir][0] = x[][i] + else: + assign(ra[][ir], x[][i]) + inc ir +template makePackMX(F,P,T,L,N0) {.dirty.} = + # P: perm, T: vector type, L: outer vec len, N0: inner vec len + when N0>P: + proc F*(r:var openArray[SomeNumber], x:T, + l:var openarray[SomeNumber]) {.inline.} = + const N02 = N0 div 2 + let ra = cast[ptr array[L,array[N02,type(r[0])]]](r[0].addr) + let la = cast[ptr array[L,array[N02,type(l[0])]]](l[0].addr) + bind forStatic + forStatic i, 0, L-1: + F(ra[][i], x[][i], la[][i]) + else: + proc F*(r:var openArray[SomeNumber], x:T, + l:var openarray[SomeNumber]) {.inline.} = + const L2 = L div 2 + let ra = cast[ptr array[L2,array[N0,type(r[0])]]](r[0].addr) + let la = cast[ptr array[L2,array[N0,type(l[0])]]](l[0].addr) + const b = (P div N0) and (L-1) + var ir,il = 0 + bind forStatic + forStatic i, 0, L-1: + if (i and b) == 0: + when N0==1: + ra[][ir][0] = x[][i] + else: + assign(ra[][ir], x[][i]) + inc ir + else: + when N0==1: + la[][il][0] = x[][i] + else: + assign(la[][il], x[][i]) + inc il +template makePackP(P,T,L,N0) {.dirty.} = + bind evalBacktic, makePackPX + evalBacktic: + makePackPX(`"packp" P`,P,T,L,N0) +template makePackM(P,T,L,N0) {.dirty.} = + bind evalBacktic, makePackMX + evalBacktic: + makePackMX(`"packm" P`,P,T,L,N0) +template makeBlendPX(F,P,T,L,N0) {.dirty.} = + when N0>P: + proc F*(x:var T; r:openArray[SomeNumber]; + l:openArray[SomeNumber]) {.inline.} = + const N02 = N0 div 2 + let ra = cast[ptr array[L,array[N02,type(r[0])]]](unsafeAddr(r[0])) + let la = cast[ptr array[L,array[N02,type(l[0])]]](unsafeAddr(l[0])) + forStatic i, 0, L-1: + F(x[][i], ra[][i], la[][i]) + else: + proc F*(x:var T; r:openArray[SomeNumber]; + l:openArray[SomeNumber]) {.inline.} = + const L2 = L div 2 + let ra = cast[ptr array[L2,array[N0,type(r[0])]]](unsafeAddr(r[0])) + let la = cast[ptr array[L2,array[N0,type(l[0])]]](unsafeAddr(l[0])) + const b = (P div N0) and (L-1) + var ir,il = 0 + bind forStatic + forStatic i, 0, L-1: + if (i and b) == 0: + when N0==1: + x[][i] = la[][il][0] + else: + assign(x[][i], la[][il]) + inc il + else: + when N0==1: + x[][i] = ra[][ir][0] + else: + assign(x[][i], ra[][ir]) + inc ir +template makeBlendMX(F,P,T,L,N0) {.dirty.} = + when N0>P: + proc F*(x:var T; r:openArray[SomeNumber]; + l:openArray[SomeNumber]) {.inline.} = + const N02 = N0 div 2 + let ra = cast[ptr array[L,array[N02,type(r[0])]]](unsafeAddr(r[0])) + let la = cast[ptr array[L,array[N02,type(l[0])]]](unsafeAddr(l[0])) + bind forStatic + forStatic i, 0, L-1: + F(x[][i], ra[][i], la[][i]) + else: + proc F*(x:var T; r:openArray[SomeNumber]; + l:openArray[SomeNumber]) {.inline.} = + const L2 = L div 2 + let ra = cast[ptr array[L2,array[N0,type(r[0])]]](unsafeAddr(r[0])) + let la = cast[ptr array[L2,array[N0,type(l[0])]]](unsafeAddr(l[0])) + const b = (P div N0) and (L-1) + var ir,il = 0 + bind forStatic + forStatic i, 0, L-1: + if (i and b) == 0: + when N0==1: + x[][i] = ra[][ir][0] + else: + assign(x[][i], ra[][ir]) + inc ir + else: + when N0==1: + x[][i] = la[][il][0] + else: + assign(x[][i], la[][il]) + inc il +template makeBlendP(P,T,L,N0) {.dirty.} = + bind evalBacktic, makeBlendPX + evalBacktic: + makeBlendPX(`"blendp" P`,P,T,L,N0) +template makeBlendM(P,T,L,N0) {.dirty.} = + bind evalBacktic, makeBlendMX + evalBacktic: + makeBlendMX(`"blendm" P`,P,T,L,N0) + +discard """ + proc op*(x:var T; r:openArray[SomeNumber]; + l:openArray[SomeNumber]) {.inline.} = + let ra = cast[ptr array[2,type(r[0])]](unsafeAddr(r[2])) + let la = cast[ptr array[2,type(l[0])]](unsafeAddr(l[2])) + op(x[][0], r, l) + op(x[][1], ra[], la[]) +""" + +#type SimdArrayObj*[L,B] = object +# v*: array[L,B] + +template arrayType[T](N: typed, x: typedesc[T]): untyped = + type B {.gensym.} = T.type + #static: echo "arrayType: ", N, " ", B + array[N,B] +# T = Simd{S,D}{L} = array[L,B] +# B ~ array[N0,F] +template makeSimdArray*(T:untyped;L:typed;B:typedesc):untyped {.dirty.} = +#template makeSimdArray*(L:typed;B:typedesc;T:untyped) {.dirty.} = + #static: echo "makeSimdArray: ", L, " ", $B.type + #makeSimdArray2(T, L, type B, numberType(B), numNumbers(B), L*numNumbers(B)) + makeSimdArray2(L, B, numberType(B), numNumbers(B), L*numNumbers(B), T) +#template makeSimdArray2*(T:untyped;L,B,F,N0,N:typed):untyped {.dirty.} = +#template makeSimdArray2*(T:untyped;L:typed;BB,F:typedesc;N0,N:typed):untyped {.dirty.} = +template makeSimdArray2*(L:typed;B,F:typedesc;N0,N:typed,T:untyped) {.dirty.} = + getOptimPragmas() + #static: echo "makeSimdArray2: ", L, " ", $B + bind map011, map021, map021x, map110, map120, map120x, map130, arrayType + bind makePerm, makePackP, makePackM, makeBlendP, makeBlendM + #type B {.gensym.} = typeof(BB) + #type T* = distinct array[L,B] + type T* = object + #v*: array[L,B.type] + v*: arrayType(L,B) + #v* {.align(L*sizeof(B)).}: arrayType(L,B) + #type T* = SimdArrayObj[L,B] + template isWrapper*(x:typedesc[T]): bool = false + template isWrapper*(x:T): bool = false + template numberType*(x:typedesc[T]):typedesc = F + template numberType*(x:T):typedesc = F + template numNumbers*(x:typedesc[T]):untyped = N + template numNumbers*(x:T):untyped = N + template simdType*(x:typedesc[T]):typedesc = T + template simdType*(x:T):typedesc = T + template simdLength*(x:T):untyped = N + template simdLength*(x:typedesc[T]):untyped = N + template eval*(x:T): untyped = x + template eval*(x:typedesc[T]): typedesc = T + #template `[]`*(x:T):untyped = (array[L,B])(x) + template `[]`*(x:T):untyped = x.v + template `[]=`*(x: T; y: typed) = x.v = y + when B is SomeNumber: + template `[]`*(x:T; i:SomeInteger):untyped = x[][i div N0] + template `[]=`*(x:T; i:SomeInteger; y:auto) = x[][i div N0] := y + else: + template `[]`*(x:T; i:SomeInteger):untyped = x[][i div N0][i mod N0] + template `[]=`*(x:T; i:SomeInteger; y:auto) = x[][i div N0][i mod N0] = y + template load1*(x:T):untyped = x + proc to*(x:SomeNumber; y:typedesc[T]):T {.alwaysInline,noInit.} = + bind forStatic + forStatic i, 0, L-1: + assign(result[][i], x) + #echoAst: F + #static: echo F.treerepr + #when not(F is float32): + # should be handled in simd.nim now + #when F is float64: + # template toDoubleImpl*(x: T): untyped = x + #else: + # template toSingleImpl*(x: T): untyped = x + proc simdReduce*(r: var SomeNumber; x: T) {.inline.} = + #mixin add + var y = x[][0] + forStatic i, 1, L-1: + iadd(y, x[][i]) + r = (type(r))(simdReduce(y)) + proc simdReduce*(x:T):F {.noInit,inline.} = simdReduce(result, x) + template simdSum*(r:var SomeNumber; x:T) = simdReduce(r, x) + template simdSum*(x:T):untyped = simdReduce(x) + proc simdMaxReduce*(r:var SomeNumber; x:T) {.inline.} = + mixin simdMaxReduce + var y = x[][0] + forStatic i, 1, L-1: + let c = x[][i] + y = max(y, c) + r = (type(r))(simdMaxReduce(y)) + proc simdMaxReduce*(x:T):F {.noinit,inline.} = simdMaxReduce(result, x) + template simdMax*(r:var SomeNumber; x:T) = simdMaxReduce(r, x) + template simdMax*(x:T):untyped = simdMaxReduce(x) + proc simdMinReduce*(r:var SomeNumber; x:T) {.inline.} = + mixin simdMinReduce + var y = x[][0] + forStatic i, 1, L-1: + let c = x[][i] + y = min(y, c) + r = (type(r))(simdMinReduce(y)) + proc simdMinReduce*(x:T):F {.noinit,inline.} = simdMinReduce(result, x) + template simdMin*(r:var SomeNumber; x:T) = simdMinReduce(r, x) + template simdMin*(x:T):untyped = simdMinReduce(x) + proc `-`*(x:T):T {.inline,noInit.} = + forStatic i, 0, L-1: + neg(result[][i], x[][i]) + + map011(T, L, abs, abs) + map011(T, L, trace, trace) + map011(T, L, norm2, norm2) + map011(T, L, sqrt, sqrt) + map011(T, L, rsqrt, rsqrt) + map011(T, L, inv, inv) + map011(T, L, sin, sin) + map011(T, L, cos, cos) + map011(T, L, acos, acos) + + map021(T, L, atan2, atan2) + map021(T, L, min, min) + map021(T, L, max, max) + map021(T, L, add, `+`) + map021(T, L, sub, `-`) + map021(T, L, mul, `*`) + map021(T, L, divd, `/`) + map021(T, L, `+`, `+`) + map021(T, L, `-`, `-`) + map021(T, L, `*`, `*`) + map021(T, L, `/`, `/`) + #map021(T, L, `<`, `<`) + map021(T, L, copySign, copySign) + + map110(T, L, assign, assign) + map110(T, L, neg, neg) + map110(T, L, iadd, iadd) + map110(T, L, isub, isub) + map110(T, L, imul, imul) + map110(T, L, idiv, idiv) + map110(T, L, norm2, norm2) + map110(T, L, inorm2, inorm2) + map110(T, L, rsqrt, rsqrt) + map110(T, L, `+=`, iadd) + map110(T, L, `-=`, isub) + map110(T, L, `*=`, imul) + map110(T, L, `/=`, idiv) + + map120(T, L, add, add) + map120(T, L, sub, sub) + map120(T, L, mul, mul) + map120(T, L, divd, divd) + map120(T, L, imadd, imadd) + map120(T, L, imsub, imsub) + + map130(T, L, msub, msub) + + template `:=`*(r: T; x: T): untyped = assign(r, x) + template `:=`*(r: T; x: array[L,B]) = + let xx = x + forStatic i, 0, L-1: + r[][i] = xx[i] + when N==1: + template assign*(r: SomeNumber, x: T): untyped = r = x[0] + proc assign*(r: var T; x: SomeNumber) {.alwaysInline.} = + #assign(r, x.to(T)) + forStatic i, 0, L-1: + assign(r[][i], x) + template `:=`*(r: T, x: SomeNumber) = assign(r, x) + proc assign*(r:var T; x:array[N,SomeNumber]) {.inline.} = + when compiles(assign(r[][0], unsafeAddr(x[0]))): + forStatic i, 0, L-1: + assign(r[][i], unsafeAddr(x[i*N0])) + else: + var y = x + forStatic i, 0, L-1: + assign(r[][i], unsafeAddr(y[i*N0])) + proc assign*(r:var array[N,SomeNumber], x:T) {.inline.} = + bind forStatic + when B is SomeNumber: + forStatic i, 0, L-1: + r[i] = x[][i] + else: + forStatic i, 0, L-1: + assign(addr(r[i*N0]), x[][i]) + proc assign*(m: Masked[T], x: SomeNumber) = + #static: echo "a mask" + var i = 0 + var b = m.mask + while b != 0: + if (b and 1) != 0: + m.pobj[][i] = x + b = b shr 1 + i.inc + #static: echo "end a mask" + template add*(r:var T; x:SomeNumber; y:T) = add(r, x.to(type(T)), y) + template add*(r:var T; x:T; y:SomeNumber) = add(r, x, y.to(type(T))) + template sub*(r:var T; x:SomeNumber; y:T) = sub(r, x.to(type(T)), y) + template sub*(r:var T; x:T; y:SomeNumber) = sub(r, x, y.to(type(T))) + template mul*(r:var T; x:SomeNumber; y:T) = mul(r, x.to(type(T)), y) + #map120x(SomeNumber,T,T,L,mul,mul) + template mul*(r:var T; x:T; y:SomeNumber) = mul(r, x, y.to(type(T))) + template iadd*(r:var T; x:SomeNumber) = iadd(r, x.to(type(T))) + template isub*(r:var T; x:SomeNumber) = isub(r, x.to(type(T))) + template imadd*(r:var T; x:SomeNumber; y:T) = imadd(r, x.to(type(T)), y) + template imsub*(r:var T; x:SomeNumber; y:T) = imsub(r, x.to(type(T)), y) + template divd*(r:var T; x:SomeNumber; y:T) = divd(r, x.to(type(T)), y) + template imadd*(r:var T; x:T; y:SomeNumber) = imadd(r, x, y.to(type(T))) + template imsub*(r:var T; x:T; y:SomeNumber) = imsub(r, x, y.to(type(T))) + template divd*(r:var T; x:T; y:SomeNumber) = divd(r, x, y.to(type(T))) + template imul*(r:var T; x:SomeNumber) = imul(r, x.to(type(T))) + template idiv*(r:var T; x:SomeNumber) = idiv(r, x.to(type(T))) + template msub*(r:var T; x:SomeNumber; y,z:T) = msub(r, x.to(type(T)), y, z) + template `:=`*(r:var T; x:array[N,SomeNumber]) = assign(r, x) + template `+`*(x:SomeNumber; y:T):T = add(x.to(type(T)), y) + template `+`*(x:T; y:SomeNumber):T = add(x, y.to(type(T))) + template `-`*(x:SomeNumber; y:T):T = sub(x.to(type(T)), y) + template `-`*(x:T; y:SomeNumber):T = sub(x, y.to(type(T))) + template `*`*(x:SomeNumber; y:T):T = mul(x.to(type(T)), y) + #map021x(SomeNumber,T,T,L,`*`,`*`) + template `*`*(x:T; y:SomeNumber):T = mul(x, y.to(type(T))) + template `/`*(x:SomeNumber; y:T):T = divd(x.to(type(T)), y) + template `/`*(x:T; y:SomeNumber):T = divd(x, y.to(type(T))) + template `<`*(x:T; y:SomeNumber):T = `<`(x, y.to(type(T))) + template `+=`*(r:var T; x:SomeNumber) = iadd(r, x.to(type(T))) + template `-=`*(r:var T; x:SomeNumber) = isub(r, x.to(type(T))) + template `*=`*(r:var T; x:SomeNumber) = imul(r, x.to(type(T))) + template `/=`*(r:var T; x:SomeNumber) = idiv(r, x.to(type(T))) + + proc `$`*(x:T):string = + result = "[" & $x[0] + for i in 1.. Date: Fri, 15 Nov 2024 22:56:05 -0600 Subject: [PATCH 11/23] merge updates from wrap --- src/physics/spinOld.nim | 10 +++++++--- src/simd.nim | 41 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/physics/spinOld.nim b/src/physics/spinOld.nim index 4e73d40..09a8077 100644 --- a/src/physics/spinOld.nim +++ b/src/physics/spinOld.nim @@ -33,7 +33,7 @@ template index*[T,I](x: typedesc[Spin[T]], i: typedesc[I]): typedesc = index(T.type, I.type) template `[]`*[T](x: Spin, i: T): untyped = - when T is Spin: + when T is Spin2: x[][i[]] elif T.isWrapper: #indexed(x, i) @@ -43,8 +43,8 @@ template `[]`*[T](x: Spin, i: T): untyped = x[][i] template `[]`*(x: Spin, i,j: typed): untyped = x[][i,j] #template `[]=`*[T](x: Spin, i: T; y: typed) = -proc `[]=`*[T](x: Spin, i: T; y: auto) = - when T is Spin: +proc `[]=`*[T](x: var Spin, i: T; y: auto) = + when T is Spin2: x[][i[]] = y elif T.isWrapper: #indexed(x, i) @@ -211,6 +211,8 @@ template spinVector*(x: static[int], a: untyped): untyped = #let t = asSpin(t1) #static: echo "spinVector2" #t +template spinVector*[T](x: static[int], a: typedesc[T]): untyped = + asSpin(asVectorArray(x, type T)) template spinMatrix*[T](x,y:static[int], a: untyped): untyped = const I:int = x @@ -315,6 +317,8 @@ const template I(x: typed): untyped = newImag(1)*x +template mI(x: typed): untyped = + newImag(-1)*x proc spproj1p*(r: var auto, x: auto) = ## r: HalfFermion diff --git a/src/simd.nim b/src/simd.nim index 0842aea..dd7420d 100644 --- a/src/simd.nim +++ b/src/simd.nim @@ -2,7 +2,7 @@ #import simdGcc #export simdGcc import base/metaUtils -import math +import math, macros import simd/simdWrap export simdWrap @@ -12,10 +12,17 @@ import base/stdUtils import simd/simdArray export simdArray -template msa(T,N,F: untyped) {.dirty,used.} = - makeSimdArray(`T Obj`, N, F) - type T* = Simd[`T Obj`] +#template msa(T,N,F: untyped) {.dirty,used.} = +template msa(T: untyped, N: static[int], F: typedesc) {.dirty,used.} = + #static: echo "msa: ", N, " ", F.type + #makeSimdArray(N, F, `T Obj`) #template `T Array` = discard + #makeSimdArray(`T Obj`, N, F) + when declared SimdArrayObj: + type `T Obj`* = SimdArrayObj[N,F] + else: + makeSimdArray(`T Obj`, N, F) + type T* = Simd[`T Obj`] type `T Array`* = `T Obj` #static: echo "made type", $T @@ -92,6 +99,8 @@ when true: msa(SimdS2, 2, float32) when not declared(SimdD2): msa(SimdD2, 2, float64) + when not declared(SimdS2Obj): + type SimdS2Obj* = `[]`(SimdS2) when not declared(SimdD2Obj): type SimdD2Obj* = `[]`(SimdD2) @@ -99,6 +108,10 @@ when true: when true: msa(SimdS1, 1, float32) msa(SimdD1, 1, float64) + when not declared(SimdS1Obj): + type SimdS1Obj* = `[]`(SimdS1) + when not declared(SimdD1Obj): + type SimdD1Obj* = `[]`(SimdD1) ## mixed precision assignment @@ -321,4 +334,24 @@ template assignX*(x: var Simd, y: Simd2) = debugType: y assign(x[], y[]) +macro simdObjType*(N: static int, T: typedesc): auto = + #echo T.repr + let p = if T.repr == "float32": "S" else: "D" + result = ident("Simd" & p & $N & "Obj") + #echo result + +type + SimdObjType*[N:static int, T] = simdObjType(N,T) +when not declared SimdArrayObj: + type SimdArrayObj*[N:static int,T] = SimdObjType[N,T] + +#[ +template toDoubleImpl*(x: T): auto = + mixin simdObjType, assign + when F is float64: x + else: + type D = simdObjType(N, float64) + var r {.noInit.}: D + assign(r, x) +]# From d2032ec9e5df56cccf4a6ab9137cef36cd99b450 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sun, 17 Nov 2024 14:49:30 -0600 Subject: [PATCH 12/23] improve benchmarking --- src/base/globals.nim | 10 +++++++ src/base/profile.nim | 57 ++++++++----------------------------- src/bench/benchLinalg.nim | 12 ++++++-- src/bench/benchStagProp.nim | 4 +-- src/experimental/stagag.nim | 5 +--- src/physics/qcdTypes.nim | 1 - tests/base/tshift.nim | 26 +++++++++++------ 7 files changed, 53 insertions(+), 62 deletions(-) diff --git a/src/base/globals.nim b/src/base/globals.nim index 4932570..428bf6a 100644 --- a/src/base/globals.nim +++ b/src/base/globals.nim @@ -44,3 +44,13 @@ macro setDefaultNc*(n: static[int]): untyped = result = newEmptyNode() macro getDefaultNc*(): untyped = return newLit(defaultNc) + +template getDefPrecStr:string = + const defPrec {.strdefine.} = "D" + defPrec + +var defPrec* {.compiletime.} = getDefPrecStr() +macro setDefaultSingle* = + defPrec = "S" + echo "Default precision: ", defPrec +static: echo "Default precision: ", defPrec diff --git a/src/base/profile.nim b/src/base/profile.nim index f577bec..506568c 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -1,6 +1,6 @@ import threading export threading -import comms/comms, stdUtils, base/basicOps +import comms/comms, stdUtils, base/[basicOps,params] import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat export monotimes getOptimPragmas() @@ -20,7 +20,7 @@ var ##[ -Each Tic starts a local timer. Each Toc records the time difference +Each tic() starts a local timer. Each toc() records the time difference of the current time with the one in the local timer visible in the scope, and then update the timer with the current time. @@ -516,49 +516,10 @@ template tocI(f: SomeNumber; s:SString = ""; n = -1) = localCode {.global.} = newList[CodePoint]() thisCode = CodePoint(-1) if threadNum==0: - when false: - #echo "==== begin toc ",s," ",ii - #echo " rtiStack: ",indent($rtiStack,5) - #echo " cpHeap: ",indent($cpHeap,5) - if unlikely VerboseTimer: echoToc(s,ii) - if prevRTI.int32 >= 0: - if restartTimer: - thawTimers() - restartTimer = false - if not timersFrozen(): - let theTime = getTics() - when not cname: - for c in items(localCode): - if cpHeap[c.int].name.equal(s): - thisCode = c - break - if thisCode.isNil: - thisCode = newCodePoint(ii.addr, s) - when not cname: - localCode.add thisCode - let - ns = theTime-localTimer - thisRTI = record(localTic, prevRTI, thisCode, ns, float(f)) - var oh = rtiStack[thisRTI.int].childrenOverhead - let c = rtiStack[thisRTI.int].children - for i in 0.. DropWasteTimerRatio: - # Signal stop if the overhead is too large. - dropTimer(prevRTI) - if toDropTimer(thisCode): - freezeTimers() - restartTimer = true - localTimer = getTics() - rtiStack[thisRTI.int].overhead = nsec(localTimer-theTime) - prevRTI = thisRTI - #echo "==== end toc ",s," ",ii + when cname: + tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,false) else: - when cname: - tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,false) - else: - tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,addr localCode) + tocSet(localTimer,prevRTI,restartTimer,thisCode,f,s,ii.addr,localTic,addr localCode) when noTicToc: template toc*() = discard @@ -822,7 +783,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] = t.children.add ri.children[i] do: # loc not found hs[loc] = ri - #let tot = makeHotSpotTable(List[RTInfoObj](ri.children)) + let tot = makeHotSpotTable(List[RTInfoObj](ri.children)) return (nstot, ohtot) proc echoHotspots* = @@ -879,6 +840,12 @@ proc echoHotspots* = let tsnspct = 100.0 * tsns / nstot echo &"{pct:6.3f} {tsnspct:7.3f} {count} {mf} {nc} S {lc} {nm}" +proc echoProf*(def = 0) = + case intParam("prof",def) + of 1: echoHotspots() + of 2: echoTimers() + else: discard + when isMainModule: import os proc test = diff --git a/src/bench/benchLinalg.nim b/src/bench/benchLinalg.nim index 68d6144..928fae7 100644 --- a/src/bench/benchLinalg.nim +++ b/src/bench/benchLinalg.nim @@ -45,6 +45,7 @@ proc checkMem = # fps: flops per site # bps: bytes moved (load+store) per site # mm: memory footprint (bytes) per site +var minNrep = int.high template bench(fps,bps,mm,eqn: untyped) {.dirty.} = block: let vol = lo.nSites.float @@ -73,12 +74,19 @@ template bench(fps,bps,mm,eqn: untyped) {.dirty.} = inc nbench echo "bench: ",nbench|(-6), "secs: ", dt|(6,3), " mf: ", mf|7, " mb: ", mb|7, " mem: ", mem, " nrep: ", nrep echo exp2string(eqn), "\n" + minNrep = min(minNrep, nrep) template bench(fps,bps,eqn: untyped) = bench(fps,bps,0,eqn) proc test(lat:auto) = - let maxl = intParam("maxl",16) - if lat[0] > maxl: return + let minl = intParam("minl",0) + if lat[0] < minl: return + let maxl = intParam("maxl",0) + if maxl>0: + if lat[0] > maxl: return + else: + if minNrep == 1: return + minNrep = int.high var lo = newLayout(lat) template newCV: untyped = lo.ColorVector() template newCM: untyped = lo.ColorMatrix() diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim index 2ac4f8f..164758c 100644 --- a/src/bench/benchStagProp.nim +++ b/src/bench/benchStagProp.nim @@ -63,7 +63,7 @@ threads: threadBarrier() echo r.norm2 #echo v2 -echoTimers() +echoProf() var g3:array[8,type(g[0])] for i in 0..3: @@ -76,6 +76,6 @@ var s3 = newStag3(g3) s3.solve(v2, v1, mass, sp) resetTimers() s3.solve(v2, v1, mass, sp) -echoTimers() +echoProf() qexFinalize() diff --git a/src/experimental/stagag.nim b/src/experimental/stagag.nim index e29f934..4574ce9 100644 --- a/src/experimental/stagag.nim +++ b/src/experimental/stagag.nim @@ -2103,8 +2103,5 @@ if outfn != "": echo "Saving gauge field to file: ", outfn let err = g.saveGauge outfn -case intParam("prof",0) -of 1: echoTimers() -of 2: echoHotspots() -else: discard +echoProf() qexFinalize() diff --git a/src/physics/qcdTypes.nim b/src/physics/qcdTypes.nim index 007ca9f..2f0bcef 100644 --- a/src/physics/qcdTypes.nim +++ b/src/physics/qcdTypes.nim @@ -416,7 +416,6 @@ macro makeConstructors(x: untyped): untyped = result = newStmtList() result.add getAst mp(ident(f&"S"), ident("S"&r&"V"), ident"result") result.add getAst mp(ident(f&"D"), ident("D"&r&"V"), ident"result") - const defPrec {.strdefine.} = "D" result.add getAst mp(ident(f), ident(defPrec&r&"V"), ident"result") # non-Simd versions result.add getAst mp(ident(f&"S1"), ident("S"&r), ident"result") diff --git a/tests/base/tshift.nim b/tests/base/tshift.nim index 75d39e5..4ddc04f 100644 --- a/tests/base/tshift.nim +++ b/tests/base/tshift.nim @@ -126,18 +126,20 @@ template testS16(Smd: typedesc) = qexInit() template makeSimdArrayX(T,N,B: untyped) {.dirty.} = - makeSimdArray(`T X`, N, B) - type T = Simd[`T X`] - template toDoubleImpl(x: `T X`): untyped = x # always already double + #makeSimdArray(`T X`, N, B) + #type T = Simd[`T X`] + #template toDoubleImpl(x: `T X`): untyped = x # always already double + type T = Simd[SimdArrayObj[N,B]] #testS1(float) + makeSimdArrayX(SD1, 1, float) testS1(SD1) when declared(SimdD1): testS1(SimdD1) -#makeSimdArrayX(SS1, 1, float32) -#testS1(SS1) +makeSimdArrayX(SS1, 1, float32) +testS1(SS1) when declared(SimdS1): testS1(SimdS1) @@ -146,8 +148,8 @@ testS2(SD2) when declared(SimdD2): testS2(SimdD2) -#makeSimdArrayX(SS2, 2, float32) -#testS2(SS2) +makeSimdArrayX(SS2, 2, float32) +testS2(SS2) when declared(SimdS2): testS2(SimdS2) @@ -155,6 +157,9 @@ makeSimdArrayX(SD4, 4, float) testS4(SD4) when declared(SimdD4): testS4(SimdD4) + +makeSimdArrayX(SS4, 4, float32) +testS4(SS4) when declared(SimdS4): testS4(SimdS4) @@ -162,6 +167,9 @@ makeSimdArrayX(SD8, 8, float) testS8(SD8) when declared(SimdD8): testS8(SimdD8) + +makeSimdArrayX(SS8, 8, float32) +testS8(SS8) when declared(SimdS8): testS8(SimdS8) @@ -169,8 +177,10 @@ makeSimdArrayX(SD16, 16, float) testS16(SD16) when declared(SimdD16): testS16(SimdD16) + +makeSimdArrayX(SS16, 16, float32) +testS16(SS16) when declared(SimdS16): testS16(SimdS16) - qexFinalize() From 5d6b7b47a61651bc116484fb35a91f7966f5a268 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Tue, 19 Nov 2024 15:19:52 -0600 Subject: [PATCH 13/23] fix threading issue fix issue when doing multiple hotspot outputs add single precision versions of some benchmarks --- src/base/backtrace.nim | 3 +- src/base/omp.nim | 11 ++ src/base/profile.nim | 1 + src/base/stdUtils.nim | 13 ++- src/base/threading.nim | 218 +++++++++++++++++++++++++++-------- src/bench/benchLinalgS.nim | 3 + src/bench/benchStagPropS.nim | 3 + src/comms/commsUtils.nim | 30 ++++- src/examples/staghmc_sh.nim | 3 +- src/layout/shifts.nim | 3 +- 10 files changed, 222 insertions(+), 66 deletions(-) create mode 100644 src/bench/benchLinalgS.nim create mode 100644 src/bench/benchStagPropS.nim diff --git a/src/base/backtrace.nim b/src/base/backtrace.nim index 0a75dbe..d393cd8 100644 --- a/src/base/backtrace.nim +++ b/src/base/backtrace.nim @@ -8,7 +8,7 @@ proc backtrace_symbols(buffer: ptr pointer, size: cint): ptr UncheckedArray[cstr proc print_trace = #void *array[10]; - const nmax = 10 + const nmax = 20 var arr: array[nmax, pointer] #char **strings; @@ -22,6 +22,7 @@ proc print_trace = proc sigtrace(sig: cint) {.noconv.} = print_trace() + quit() proc setTrace* = c_signal(SIGSEGV, sigtrace) diff --git a/src/base/omp.nim b/src/base/omp.nim index 80b803b..52b2c3f 100644 --- a/src/base/omp.nim +++ b/src/base/omp.nim @@ -28,6 +28,11 @@ else: #forceOmpOn() #{. emit:["#pragma omp ", p] .} {. emit:["_Pragma(\"omp ", p, "\")"] .} + template ompPragma(p:string,body:typed) = + {. push stackTrace:off, lineTrace:off, line_dir:off .} + {. emit:["_Pragma(\"omp ", p, "\")"] .} + body + {. pop .} template ompBlock*(p:string; body:untyped) = #{. emit:"#pragma omp " & p .} #{. emit:"{ /* Inserted by ompBlock " & p & " */".} @@ -38,6 +43,12 @@ else: #{. emit:"} /* End ompBlock " & p & " */".} template ompBarrier* = ompPragma("barrier") +template ompFlush* = ompPragma("flush") +template ompFlushAcquire* = ompPragma("flush acquire") +template ompFlushRelease* = ompPragma("flush release") +template ompFlushSeqCst* = ompPragma("flush seq_cst") +template ompAtomicRead*(body) = ompPragma("atomic read acquire", body) +template ompAtomicWrite*(body) = ompPragma("atomic write release", body) template ompParallel*(body:untyped) = ompBlock("parallel"): diff --git a/src/base/profile.nim b/src/base/profile.nim index 506568c..160f041 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -787,6 +787,7 @@ proc makeHotspotTable(lrti: List[RTInfoObj]): tuple[ns:int64,oh:int64] = return (nstot, ohtot) proc echoHotspots* = + hs.clear let tot = makeHotspotTable(rtiStack) #let nstot = tot.ns let ohtot = tot.oh diff --git a/src/base/stdUtils.nim b/src/base/stdUtils.nim index e0430c3..5da5f26 100644 --- a/src/base/stdUtils.nim +++ b/src/base/stdUtils.nim @@ -8,10 +8,11 @@ type #template `[]`*(x: cArray): untyped = addr x[0] #template `&`*(x: ptr cArray): untyped = addr x[0] -template ptrInt*(x:untyped):untyped = cast[ByteAddress](x) -template addrInt*(x:untyped):untyped = cast[ByteAddress](addr(x)) -template unsafeAddrInt*(x:untyped):untyped = cast[ByteAddress](addr(x)) -template toHex*(x: ptr typed): untyped = toHex(cast[ByteAddress](x)) +template ptrInt*(x: auto): auto = cast[uint](x) +template addrInt*(x: auto): auto = cast[uint](addr(x)) +template unsafeAddrInt*(x: auto): auto = cast[uint](addr(x)) +template toHex*(x: ptr auto): auto = toHex(cast[uint](x)) +template toHex*(x: pointer): auto = toHex(cast[uint](x)) type ConstInt* {.importc:"const int".} = object @@ -31,7 +32,7 @@ proc isInteger*(s: string):bool = var t:int parseInt(s, t) == s.len -template `$&`*(x: untyped): string = +template `$&`*(x: auto): string = toHex(unsafeAddrInt(x)) proc `|`*(s: string, d: tuple[w:int,c:char]): string = @@ -52,7 +53,7 @@ proc `|`*(f: SomeFloat, d: tuple[w,p: int]): string = formatFloat(f, ffDefault, d.p) | d.w proc `|`*(f: float, d: int): string = f | (d,d) -template `|-`*(x:SomeNumber, y: int): untyped = +template `|-`*(x:SomeNumber, y: int): auto = x | -y proc indexOf*[T](x: openArray[T], y: auto): int = diff --git a/src/base/threading.nim b/src/base/threading.nim index 73db56c..8a01205 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -4,6 +4,8 @@ import stdUtils import macros import omp import metaUtils +import base/basicOps +getOptimPragmas type ThreadShare* = object @@ -18,19 +20,34 @@ var threadNum*{.threadvar.}:int var numThreads*{.threadvar.}:int var threadLocals*{.threadvar.}:ThreadObj var inited = false +var ts: pointer = nil +var nts = 0 -template initThreadLocals*(ts:seq[ThreadShare]):untyped = +proc allocTs* {.alwaysInline.} = + if numThreads > nts and threadNum == 0: + if ts == nil: + ts = allocShared(numThreads*sizeof(ThreadShare)) + else: + ts = reallocShared(ts, numThreads*sizeof(ThreadShare)) + nts = numThreads + +#template initThreadLocals*(ts:seq[ThreadShare]) = +template initThreadLocals* = + bind ts threadLocals.threadNum = threadNum threadLocals.numThreads = numThreads - threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr) + #threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr) + threadLocals.share = cast[ptr cArray[ThreadShare]](ts) threadLocals.share[threadNum].p = nil threadLocals.share[threadNum].counter = 0 proc init = inited = true threadNum = 0 numThreads = 1 - var ts = newSeq[ThreadShare](numThreads) - initThreadLocals(ts) + #var ts = newSeq[ThreadShare](numThreads) + #initThreadLocals(ts) + allocTs() + initThreadLocals() template threadsInit* = if not inited: init() @@ -57,51 +74,61 @@ template emitStackTrace: untyped = template threads*(body:untyped):untyped = checkInit() doAssert(numThreads==1) - let tidOld = threadNum - let nidOld = numThreads - let tlOld = threadLocals + #let tidOld = threadNum + #let nidOld = numThreads + #let tlOld = threadLocals #proc tproc2{.genSym,inline.} = # body proc tproc{.genSym.} = emitStackTrace() - var ts:seq[ThreadShare] + #var ts:seq[ThreadShare] ompParallel: threadNum = ompGetThreadNum() numThreads = ompGetNumThreads() - if threadNum==0: ts.newSeq(numThreads) + #if threadNum==0: ts.newSeq(numThreads) + allocTs() threadBarrierO() - initThreadLocals(ts) + #initThreadLocals(ts) + initThreadLocals() threadBarrierO() #echoAll threadNum, " s: ", ptrInt(threadLocals.share) body #tproc2() threadBarrierO() tproc() - threadNum = tidOld - numThreads = nidOld - threadLocals = tlOld + #threadNum = tidOld + #numThreads = nidOld + #threadLocals = tlOld + threadNum = 0 + numThreads = 1 + initThreadLocals() template threads*(x0:untyped;body:untyped):untyped = checkInit() - let tidOld = threadNum - let nidOld = numThreads - let tlOld = threadLocals + #let tidOld = threadNum + #let nidOld = numThreads + #let tlOld = threadLocals proc tproc(xx:var type(x0)) {.genSym.} = - var ts:seq[ThreadShare] + #var ts:seq[ThreadShare] ompParallel: threadNum = ompGetThreadNum() numThreads = ompGetNumThreads() - if threadNum==0: ts.newSeq(numThreads) + #if threadNum==0: ts.newSeq(numThreads) + allocTs() threadBarrierO() - initThreadLocals(ts) + #initThreadLocals(ts) + initThreadLocals() threadBarrierO() #echoAll threadNum, " s: ", ptrInt(threadLocals.share) subst(x0,xx): body threadBarrierO() tproc(x0) - threadNum = tidOld - numThreads = nidOld - threadLocals = tlOld + #threadNum = tidOld + #numThreads = nidOld + #threadLocals = tlOld + threadNum = 0 + numThreads = 1 + initThreadLocals() template nothreads*(body: untyped): untyped = ## convenient way to turn off threading @@ -114,13 +141,18 @@ template threadBarrierO* = ompBarrier template threadMaster*(x:untyped) = ompMaster(x) template threadSingle*(x:untyped) = ompSingle(x) template threadCritical*(x:untyped) = ompCritical(x) +template threadFlush* = ompFlush +template threadFlushRelease* = ompFlushRelease +template threadFlushAcquire* = ompFlushAcquire +template threadFlushSeqCst* = ompFlushSeqCst +template threadAtomicRead*(body:typed) = ompAtomicRead(body) +template threadAtomicWrite*(body:typed) = ompAtomicWrite(body) template threadDivideLow*(x,y: untyped): untyped = x + (threadNum*(y-x)) div numThreads template threadDivideHigh*(x,y: untyped): untyped = x + ((threadNum+1)*(y-x)) div numThreads - proc tForX*(index,i0,i1,body:NimNode):NimNode = return quote do: let d = 1+`i1` - `i0` @@ -154,39 +186,123 @@ iterator `.|`*[S, T](a: S, b: T): T {.inline.} = inc(res) """ -template t0waitO* = threadBarrier() -template t0waitX* = - if threadNum==0: - inc threadLocals.share[0].counter - let tbar0 = threadLocals.share[0].counter - for b in 1.. 1: + if threadNum==0: + inc threadLocals.share[0].counter + let tbar0 = threadLocals.share[0].counter + for b in 1..= tbar0: break + var t {.noInit.}: type(p[]) + ompAtomicRead: t = p[] + if t >= tbar0: break + else: + #inc threadLocals.share[threadNum].counter + #fence() + #ompRelease + let t = threadLocals.share[threadNum].counter + 1 + ompAtomicWrite: + threadLocals.share[threadNum].counter = t +template t0wait* = t0waitA() +#template t0wait* = t0waitB() + +template twait0B* = threadBarrier() +template twait0A* = + if numThreads > 1: + if threadNum==0: + #inc threadLocals.share[0].counter + #fence() + #ompRelease + let t = threadLocals.share[0].counter + 1 + ompAtomicWrite: + threadLocals.share[0].counter = t + else: + inc threadLocals.share[threadNum].counter + let tbar0 = threadLocals.share[threadNum].counter + let p = threadLocals.share[0].counter.addr while true: - fence() - if p[] >= tbar0: break - else: - inc threadLocals.share[threadNum].counter - fence() -template t0wait* = t0waitO() + #fence() + #ompAcquire + #if p[] >= tbar0: break + var t {.noInit.}: type(p[]) + ompAtomicRead: t = p[] + if t >= tbar0: break +template twait0* = twait0A() +#template twait0* = twait0B() + +template threadBarrierA* = + threadFlushRelease + t0waitA + twait0A + threadFlushAcquire +template threadBarrier* = threadBarrierA +#template threadBarrier* = ompBarrier -template twait0O* = threadBarrier() -template twait0X* = +template threadSum01A*[T](a: T) = + ## sum value with result on thread 0, atomic version if threadNum==0: - inc threadLocals.share[0].counter - fence() + tic("threadSum01A") + t0wait() + toc("t0wait") + for i in 1..= tbar0: break -template twait0* = twait0O() + threadAtomicWrite: + threadLocals.share[threadNum].p = a.addr + t0wait() + twait0() + +template threadSum01B*[T](a: T) = + ## sum value with result on thread 0, barrier version + block: + tic("threadSum01") + if threadNum!=0: + threadLocals.share[threadNum].p = a.addr + threadBarrier() + toc("threadBarrier first") + if threadNum==0: + for i in 1.. Date: Wed, 20 Nov 2024 22:42:28 -0600 Subject: [PATCH 14/23] improve threading --- src/base/profile.nim | 3 +- src/base/threading.nim | 97 +++++++++++++++++++++++++++++++++++-- src/bench/benchStagProp.nim | 2 + 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index 160f041..6a861f6 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -15,7 +15,8 @@ template ticDiffSecs*(x,y: TicType): float = 1e-9 * float(x.int64 - y.int64) template `-`*(x,y: TicType): TicType = TicType(x.int64 - y.int64) var - DropWasteTimerRatio* = 0.05 ## Drop children timers if the proportion of their overhead is larger than this. + ## Drop children timers if the proportion of their overhead is larger than this. + DropWasteTimerRatio* = floatParam("dropRatio", 0.05) VerboseTimer* = false ## If true print out all the timers during execution. ##[ diff --git a/src/base/threading.nim b/src/base/threading.nim index 8a01205..e4acf10 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -5,15 +5,18 @@ import macros import omp import metaUtils import base/basicOps +import bitops getOptimPragmas type ThreadShare* = object p*:pointer counter*:int + extra*:int ThreadObj* = object threadNum*:int numThreads*:int + #counter*: int share*:ptr cArray[ThreadShare] var threadNum*{.threadvar.}:int @@ -209,8 +212,29 @@ template t0waitA* = let t = threadLocals.share[threadNum].counter + 1 ompAtomicWrite: threadLocals.share[threadNum].counter = t -template t0wait* = t0waitA() +template t0waitC* = + if numThreads > 1: + if threadNum==0: + inc threadLocals.share[0].counter + let tbar0 = threadLocals.share[0].counter + var left = numThreads - 1 + for i in 1.. 0: + for i in 1.. 0: + let p = threadLocals.share[i].counter.addr + var t {.noInit.}: type(p[]) + ompAtomicRead: t = p[] + if t >= tbar0: + threadLocals.share[i].extra = 0 + dec left + else: + let t = threadLocals.share[threadNum].counter + 1 + ompAtomicWrite: + threadLocals.share[threadNum].counter = t +#template t0wait* = t0waitA() #template t0wait* = t0waitB() +template t0wait* = t0waitC() template twait0B* = threadBarrier() template twait0A* = @@ -244,10 +268,10 @@ template threadBarrierA* = template threadBarrier* = threadBarrierA #template threadBarrier* = ompBarrier -template threadSum01A*[T](a: T) = +template threadSum01A0*[T](a: T) = ## sum value with result on thread 0, atomic version if threadNum==0: - tic("threadSum01A") + tic("threadSum01A0") t0wait() toc("t0wait") for i in 1..= tbar0: break + var p{.noInit.}: pointer + threadAtomicRead: + p = threadLocals.share[b].p + a += cast[ptr T](p)[] + toc("sum") + twait0() + toc("twait0") + else: + threadAtomicWrite: + threadLocals.share[threadNum].p = a.addr + let t = threadLocals.share[threadNum].counter + 1 + threadAtomicWrite: + threadLocals.share[threadNum].counter = t + twait0() + template threadSum01B*[T](a: T) = ## sum value with result on thread 0, barrier version block: @@ -279,15 +330,53 @@ template threadSum01B*[T](a: T) = threadBarrier() toc("threadBarrier last") +template threadSum01T*[T](a: T) = + ## sum value with result on thread 0, tree version + threadAtomicWrite: + threadLocals.share[threadNum].p = a.addr + var c = threadLocals.share[threadNum].counter + var done = false + var b = 1 + while b < numThreads: + inc c + if not done: + let o = bitxor(threadNum, b) + if bitand(threadNum, b) == 0: + if o < numThreads: + while true: + var d{.noInit.}: int + threadAtomicRead: + d = threadLocals.share[o].counter + if d >= c: break + var p{.noInit.}: pointer + threadAtomicRead: + p = threadLocals.share[o].p + a += cast[ptr T](p)[] + threadAtomicWrite: + threadLocals.share[threadNum].counter = c + else: + threadAtomicWrite: + threadLocals.share[threadNum].counter = c + while true: + var d{.noInit.}: int + threadAtomicRead: + d = threadLocals.share[o].counter + if d >= c: break + done = true + else: + threadLocals.share[threadNum].counter = c + b *= 2 + template threadSum01*(a: auto) = threadSum01A(a) #template threadSum01*(a: auto) = threadSum01B(a) +#template threadSum01*(a: auto) = threadSum01T(a) template threadSum0*(a: auto) = threadSum01(a) # threadMax0 FIXME template threadBroadcast1A*[T](a: T) = if threadNum==0: - tic("threadRankSum1") + tic("threadBroadcast1A") threadAtomicWrite: threadLocals.share[0].p = a.addr twait0() diff --git a/src/bench/benchStagProp.nim b/src/bench/benchStagProp.nim index 164758c..4a51836 100644 --- a/src/bench/benchStagProp.nim +++ b/src/bench/benchStagProp.nim @@ -71,6 +71,8 @@ for i in 0..3: g3[2*i+1] = lo.ColorMatrix() g3[2*i+1].randomSU rs g3[2*i+1] *= 0.1 +for i in 0.. Date: Thu, 21 Nov 2024 14:01:02 -0600 Subject: [PATCH 15/23] fix echo in profile --- src/base/profile.nim | 2 +- src/comms/comms.nim | 10 +---- src/comms/commsEcho.nim | 86 ++++++++++++++++++++++++++++++++++++++++ src/comms/commsTypes.nim | 8 ++++ src/comms/commsUtils.nim | 83 -------------------------------------- 5 files changed, 97 insertions(+), 92 deletions(-) create mode 100644 src/comms/commsEcho.nim diff --git a/src/base/profile.nim b/src/base/profile.nim index 6a861f6..de7ca62 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -1,6 +1,6 @@ import threading export threading -import comms/comms, stdUtils, base/[basicOps,params] +import comms/commsEcho, stdUtils, base/[basicOps,params] import os, strutils, sequtils, std/monotimes, std/tables, std/algorithm, strformat export monotimes getOptimPragmas() diff --git a/src/comms/comms.nim b/src/comms/comms.nim index 9e4be9a..28e3be0 100644 --- a/src/comms/comms.nim +++ b/src/comms/comms.nim @@ -1,14 +1,6 @@ import commsTypes export commsTypes -# globals - -var defaultComm*: Comm -template getDefaultComm*(): Comm = defaultComm -template getComm*(): Comm = getDefaultComm() # temporary alias -var myRank* = 0 -var nRanks* = 1 - # base methods method name*(c: Comm): string {.base.} = discard @@ -157,6 +149,8 @@ commsNames.add "QMP" commsInits.add getQmpComm commsFinis.add commsFinalizeQmp +import commsEcho +export commsEcho import commsUtils export commsUtils diff --git a/src/comms/commsEcho.nim b/src/comms/commsEcho.nim new file mode 100644 index 0000000..b06c68e --- /dev/null +++ b/src/comms/commsEcho.nim @@ -0,0 +1,86 @@ +import macros +import base/[threading,metaUtils] +import commsTypes + +proc evalArgs*(call:var NimNode; args:NimNode):NimNode = + result = newStmtList() + for i in 0..".} +#proc printfOrdered( +macro printf*(fmt:string; args:varargs[untyped]):auto = + var call = newCall(ident("cprintf"), fmt) + result = evalArgs(call, args) + result.add(quote do: + if myRank==0 and threadNum==0: + `call` + ) +proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".} +macro echoAll*(args:varargs[untyped]):auto = + var call = newCall(bindSym"echoRaw") + result = evalArgs(call, args) + result.add(quote do: + `call` + ) +macro echoRank*(args:varargs[untyped]):auto = + var call = newCall(bindSym"echoRaw") + call.add ident"myRank" + call.add newLit"/" + call.add ident"nRanks" + call.add newLit": " + result = evalArgs(call, args) + template f(x:untyped):untyped = + if threadNum==0: x + result.add getAst(f(call)) +macro echo0*(args: varargs[untyped]): untyped = + var call = newCall(bindSym"echoRaw") + result = evalArgs(call, args) + result.add(quote do: + bind myRank + if myRank==0 and threadNum==0: + `call` + ) + #echo result.repr +macro makeEchos(n:static[int]): untyped = + template ech(x,y: untyped) = + template echo* = + when nimvm: + x + else: + y + result = newStmtList() + for i in 1..n: + var er = newCall(bindSym"echoRaw") + var e0 = newCall(bindSym"echo0") + var ea = newSeq[NimNode](0) + for j in 1..i: + let ai = ident("a" & $j) + er.add ai + e0.add ai + ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode()) + var t = getAst(ech(er,e0)).peelStmt + #echo t.treerepr + for j in 0..".} -#proc printfOrdered( -macro printf*(fmt:string; args:varargs[untyped]):auto = - var call = newCall(ident("cprintf"), fmt) - result = evalArgs(call, args) - result.add(quote do: - if myRank==0 and threadNum==0: - `call` - ) -proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".} -macro echoAll*(args:varargs[untyped]):auto = - var call = newCall(bindSym"echoRaw") - result = evalArgs(call, args) - result.add(quote do: - `call` - ) -macro echoRank*(args:varargs[untyped]):auto = - var call = newCall(bindSym"echoRaw") - call.add ident"myRank" - call.add newLit"/" - call.add ident"nRanks" - call.add newLit": " - result = evalArgs(call, args) - template f(x:untyped):untyped = - if threadNum==0: x - result.add getAst(f(call)) -macro echo0*(args: varargs[untyped]): untyped = - var call = newCall(bindSym"echoRaw") - result = evalArgs(call, args) - result.add(quote do: - bind myRank - if myRank==0 and threadNum==0: - `call` - ) - #echo result.repr -macro makeEchos(n:static[int]): untyped = - template ech(x,y: untyped) = - template echo* = - when nimvm: - x - else: - y - result = newStmtList() - for i in 1..n: - var er = newCall(bindSym"echoRaw") - var e0 = newCall(bindSym"echo0") - var ea = newSeq[NimNode](0) - for j in 1..i: - let ai = ident("a" & $j) - er.add ai - e0.add ai - ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode()) - var t = getAst(ech(er,e0)).peelStmt - #echo t.treerepr - for j in 0.. Date: Thu, 21 Nov 2024 14:02:12 -0600 Subject: [PATCH 16/23] add version info --- src/base/version.nim | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/base/version.nim diff --git a/src/base/version.nim b/src/base/version.nim new file mode 100644 index 0000000..66a47f8 --- /dev/null +++ b/src/base/version.nim @@ -0,0 +1,22 @@ +import strutils, times + +const sep = '='.repeat(78) +const head = "QEX Compilation information:" +const comptime = " Time: " & staticExec("date") +const compmach = " Host: " & staticExec("uname -a") +const gitlog = " Log: " & staticExec("git log -1").indent(7).strip +const gitstat = " Status: " & staticExec("git status -uno").indent(10).strip +const buildInfo = [sep,head,comptime,compmach,gitlog,gitstat,sep].join("\n") +const gitrev = staticExec("git rev-parse HEAD") + +static: echo buildInfo + +proc getBuildInfo*(): string = + buildInfo + +proc getVersion*(): string = + gitrev + +when isMainModule: + echo getBuildInfo() + echo getVersion() From b46d18cb5f2b10fffe640a5db7bcdd9cb9f3055f Mon Sep 17 00:00:00 2001 From: James Osborn Date: Thu, 21 Nov 2024 14:04:44 -0600 Subject: [PATCH 17/23] allow setting of hwloc library --- src/hwloc/capi.nim | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hwloc/capi.nim b/src/hwloc/capi.nim index 51cfa6f..f104dcb 100644 --- a/src/hwloc/capi.nim +++ b/src/hwloc/capi.nim @@ -48,9 +48,10 @@ ## when hostOs == "macosx": - {.pragma: hwloc, dynlib:"libhwloc.dylib".} + const hwloclib {.strDefine.} = "libhwloc.dylib" else: - {.pragma: hwloc, dynlib:"libhwloc.so".} + const hwloclib {.strDefine.} = "libhwloc.so" +{.pragma: hwloc, dynlib:hwloclib.} type hwloc_bitmap_s {.bycopy.} = object From e5ed8b0a89db86101904a8cc6d1a04d9aca1f90b Mon Sep 17 00:00:00 2001 From: James Osborn Date: Sat, 23 Nov 2024 16:14:14 -0600 Subject: [PATCH 18/23] small update to simd types --- src/simd/simdX86Ops.nim | 8 ++++---- src/simd/simdX86Types.nim | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/src/simd/simdX86Ops.nim b/src/simd/simdX86Ops.nim index 4edd437..5f6d179 100644 --- a/src/simd/simdX86Ops.nim +++ b/src/simd/simdX86Ops.nim @@ -17,22 +17,22 @@ template int2mask*(T: typedesc[m512], i: SomeInteger): mmask16 = cvtu32_mask16(u proc `[]=`*(r:var m128; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float32 x assign(r, a) proc `[]=`*(r:var m128d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float x assign(r, a) proc `[]=`*(r:var m256; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float32 x assign(r, a) proc `[]=`*(r:var m256d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = mixin toArray var a = toArray(r) - a[i] := x + a[i] = float x assign(r, a) #proc `[]=`*(r:var m256d; i:SomeInteger; x:SomeNumber) {.alwaysInline.} = # var a {.noInit.}: m256d diff --git a/src/simd/simdX86Types.nim b/src/simd/simdX86Types.nim index bed968b..f1b001e 100644 --- a/src/simd/simdX86Types.nim +++ b/src/simd/simdX86Types.nim @@ -2,25 +2,52 @@ import simdWrap export simdWrap {.pragma: imm, header:"immintrin.h".} +{.pragma: imms, header:"immintrin.h", incompleteStruct.} +type + m64* {.importc: "__m64" , imms.} = object + m128* {.importc: "__m128" , imms.} = object + m128d* {.importc: "__m128d", imms.} = object + m128i* {.importc: "__m128i", imms.} = object + m128h* = distinct int64 + m256* {.importc: "__m256" , imms.} = object + m256d* {.importc: "__m256d", imms.} = object + m256i* {.importc: "__m256i", imms.} = object + m256h* = distinct m128i + m512* {.importc: "__m512" , imms.} = object + m512d* {.importc: "__m512d", imms.} = object + m512i* {.importc: "__m512i", imms.} = object + m512h* = distinct m256i + mmask8* {.importc: "__mmask8" , imms.} = object + mmask16* {.importc: "__mmask16", imms.} = object + mmask32* {.importc: "__mmask32", imms.} = object + mmask64* {.importc: "__mmask64", imms.} = object +#[ +{.pragma: imm, header:"immintrin.h", incompleteStruct.} type m64* {.importc: "__m64" , imm.} = object - m128* {.importc: "__m128" , imm.} = object - m128d* {.importc: "__m128d", imm.} = object + a: array[2,float32] + m128* {.importc: "__m128" , imm.} = distinct array[4,float32] + m128d* {.importc: "__m128d", imm.} = distinct array[2,float64] m128i* {.importc: "__m128i", imm.} = object + a: array[4,int32] m128h* = distinct int64 m256* {.importc: "__m256" , imm.} = object - m256d* {.importc: "__m256d", imm.} = object + a: array[8,float32] + m256d* {.importc: "__m256d", imm.} = distinct array[4,float64] m256i* {.importc: "__m256i", imm.} = object + a: array[8,int32] m256h* = distinct m128i - m512* {.importc: "__m512" , imm.} = object - m512d* {.importc: "__m512d", imm.} = object - m512i* {.importc: "__m512i", imm.} = object + m512* {.importc: "__m512" , imm.} = distinct array[16,float32] + m512d* {.importc: "__m512d", imm.} = distinct array[8,float64] + m512i* {.importc: "__m512i", imm.} = distinct array[16,int32] m512h* = distinct m256i mmask8* {.importc: "__mmask8" , imm.} = object mmask16* {.importc: "__mmask16", imm.} = object mmask32* {.importc: "__mmask32", imm.} = object mmask64* {.importc: "__mmask64", imm.} = object +]# +type SimdX86S* = m64 | m128 | m256 | m512 SimdX86D* = m128d | m256d | m512d SimdX86* = SimdX86S | SimdX86D From 01128e27c9d309b2b37953794558d4b2e78d8e50 Mon Sep 17 00:00:00 2001 From: James Osborn Date: Mon, 2 Dec 2024 23:14:33 -0600 Subject: [PATCH 19/23] include compile-time parameters in help small update to random generators --- src/base/params.nim | 57 ++++++++++++++++++++++++++----------- src/base/stdUtils.nim | 7 +++++ src/base/threading.nim | 2 +- src/bench/benchStagProp.nim | 3 ++ src/layout/layoutX.nim | 13 +++++++-- src/rng/distribution.nim | 2 +- src/rng/generator.nim | 6 ++++ src/rng/milcrng.nim | 18 ++++++++---- src/rng/mrg32k3a.nim | 12 ++++++-- 9 files changed, 90 insertions(+), 30 deletions(-) diff --git a/src/base/params.nim b/src/base/params.nim index dd779a1..7f896e3 100644 --- a/src/base/params.nim +++ b/src/base/params.nim @@ -1,14 +1,12 @@ import macros, os, strUtils, tables, algorithm, seqUtils -# Note: may get imported before 'echo' is redefined -# uses of 'echo' should be in a template with 'mixin echo' # TODO # include CT in saveParams RT -# include CT in help # check used # only write from rank 0 # only read from rank 0 # add way to represent empty seq/string (versus unset/unknown) +# sort paramHelp output by file and option name type ParamListT = object # used to store set parameters from cmd line or file @@ -28,7 +26,8 @@ var loadParamsCommand {.compileTime.} = "loadParams" # normally not changed var params: Table[string,ParamListT] # store params during setup # param info generated at compile time, as setParam commands are compiled var paramInfoCT {.compileTime.} = newSeq[ParamObj](0) -var paramInfoCTRT = newSeq[ParamObj](0) # conversion of paramInfoCT to RT variable +#var paramInfoCTRT = newSeq[ParamObj](0) # conversion of paramInfoCT to RT variable +var paramInfoCTRT: seq[ParamObj] # conversion of paramInfoCT to RT variable # param info generated at run time, as setParam commands executed (using set values) var paramInfo = newSeq[ParamObj](0) @@ -218,13 +217,16 @@ proc saveParams(x: seq[ParamObj], fn: string, loc: string, thisfile: string) = proc addParam(s,r: string, c: string = "", ii: IiType) = paramInfo.set(s, r, c, ii.filename, ii.line) -proc addComment(s,c:string):string = +let spcName = " " +let spcValue = " #" +let spcLoc = " ## " +proc addComment(s,c,spc:string):string = result = s if c.len>0: let - spc = " ## " m = min(s.len, spc.len-8) result &= spc[m..^1] & c +proc addComment(s,c:string):string = addComment(s,c,spcValue) proc echoParamsX*(warnUnknown=false):string = result = "Params:\n" @@ -243,6 +245,8 @@ template echoParams*(warnUnknown=false) = echo echoParamsX(warnUnknown) proc paramHelp*(p:string = ""):string = + #echo paramInfo + #echo paramInfoCTRT result = "Usage:\n " & getAppFileName() var t: ParamObj var i = paramInfo.find p @@ -256,10 +260,21 @@ proc paramHelp*(p:string = ""):string = result &= addComment(" -" & p & ":" & t.value & " (current value)", t.comment) else: result &= " -OPTION:VALUE ...\nAvailable OPTIONs and current VALUEs:" - let spc = " " - for t in paramInfo: + template p(t) = let nm = t.name - result &= "\n " & (nm & spc[min(spc.len-1,nm.len)..^1] & " : " & t.value).addComment(t.comment) + var s = nm & spcName[min(spcName.len-1,nm.len)..^1] & " : " & t.value + let f = t.file.splitfile[1] + let c = f & "," & $t.line + s = s.addComment(c,spcValue) + s = s.addComment(t.comment,spcLoc) + result &= "\n " & s + var paramnames = newSeq[string](0) + for t in paramInfo: + paramnames.add t.name + p(t) + for t in paramInfoCTRT: + if t.name notin paramnames: # FIXME: should check location too + p(t) template cnvnone(x:typed):untyped = x template makeTypeParam(name,typ,deflt,cnvrt: untyped): untyped {.dirty.} = @@ -375,15 +390,15 @@ template processSaveParams*(index = -1) = let ii = instantiationInfo(index, fullPaths=true) saveParams(paramInfo, saveValue, ii.filename & ":" & $ii.line, ii.filename) +var makeParamInfoCTRT: proc() template processMakeParamInfoCTRT* = - proc makeParamInfoCTRT() {.inject.} - makeParamInfoCTRT() + if not isNil makeParamInfoCTRT: makeParamInfoCTRT() template installStandardParams*(idx= -3) = installSaveParams(index=idx) installLoadParams(index=idx) installHelpParam(index=idx) - processMakeParamInfoCTRT() + #processMakeParamInfoCTRT() # write param file at compile time macro writeParamFileX*(filename: static string, ii: static IiType) = @@ -405,10 +420,14 @@ macro makeParamInfoCTRTX*(): auto = let l = newLit t.line result.add quote do: paramInfoCTRT.add ParamObj(name:`n`,value:`v`,comment:`c`,file:`f`,line:`l`) +template finalizeParams* = + proc makeParamInfoCTRTImpl(): bool = + makeParamInfoCTRTX() + true + #makeParamInfoCTRT = makeParamInfoCTRTImpl + let dum {.global.} = makeParamInfoCTRTImpl() template writeParamFile*(filename: static string = "") = writeParamFileX(filename, instantiationInfo(fullPaths=true)) - proc makeParamInfoCTRT() {.inject.} = - makeParamInfoCTRTX() template assertParam*(p:auto, f:auto) = if not f p: @@ -439,8 +458,10 @@ template CLIset*(p:typed, n:untyped, prefix = "") = discard when isMainModule: - import qex, paramtest + import qex qexInit() + installHelpParam("h") + processHelpParam() letParam: bf = false @@ -462,11 +483,15 @@ when isMainModule: fa1 = @[1.0,1,1,1] fax = if true: @[2.0,2,2,2] else: @[3.0,3,3,3] - installHelpParam("h") echoParams() defaultSetup() + proc paramTest* = + var i = intParam("i", 0, "Test intParam") + var f = floatParam("f", 0.0, "Test floatParam") + echo i, f paramTest() writeParamFile("") qexFinalize() + finalizeParams() diff --git a/src/base/stdUtils.nim b/src/base/stdUtils.nim index 5da5f26..85bc4f3 100644 --- a/src/base/stdUtils.nim +++ b/src/base/stdUtils.nim @@ -56,6 +56,13 @@ proc `|`*(f: float, d: int): string = template `|-`*(x:SomeNumber, y: int): auto = x | -y +proc values*(e: typedesc[enum]): string = + result = "" + var sep = "" + for v in e: + result &= sep & $v + sep = " " + proc indexOf*[T](x: openArray[T], y: auto): int = let n = x.len while result Date: Thu, 5 Dec 2024 18:14:05 -0600 Subject: [PATCH 20/23] added fat7 derivative --- src/gauge/fat7l.nim | 5 + src/gauge/fat7lderiv.nim | 345 +++++++++++++++++++++++++++++++++++++++ src/gauge/hypsmear.nim | 37 +---- src/gauge/smearutil.nim | 50 ++++++ 4 files changed, 401 insertions(+), 36 deletions(-) create mode 100644 src/gauge/fat7lderiv.nim create mode 100644 src/gauge/smearutil.nim diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 2aae356..9fdaea1 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -13,6 +13,11 @@ type Fat7lCoefs* = object sevenStaple*: float lepage*: float +#type Fat7lState*[I,O] = object +# coefs*: Fat7lCoefs +# in*: I +# out*: O + proc `$`*(c: Fat7lCoefs): string = result = "oneLink: " & $c.oneLink & "\n" result &= "threeStaple: " & $c.threeStaple & "\n" diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim new file mode 100644 index 0000000..f77b309 --- /dev/null +++ b/src/gauge/fat7lderiv.nim @@ -0,0 +1,345 @@ +import fat7l, smearutil +import base/profile, layout + +let STAPLE_FLOPS = 3*198+216+18 +let SIDE_FORCE_FLOPS = 7*198+3*216+3*18 + +#[ +proc staple(auto out, auto in0, auto link0, int mu, int nu) = +#define link ftmp0[0] +#define linkmu ftmp[0][mu] +#define in ftmp0[1] +#define innu ftmp[1][nu] +#define linkin mtmp[0] +#define back btmp0[0] +#define backnu btmp[0][nu] +#define linkinnu mtmp[1] + + QDP_M_eq_M(link, link0, QDP_all); + QDP_M_eq_sM(linkmu, link, QDP_neighbor[mu], QDP_forward, QDP_all); + QDP_M_eq_M(in, in0, QDP_all); + QDP_M_eq_sM(innu, in, QDP_neighbor[nu], QDP_forward, QDP_all); + QDP_M_eq_Ma_times_M(linkin, link, in, QDP_all); + QDP_M_eq_M_times_M(back, linkin, linkmu, QDP_all); + QDP_M_eq_sM(backnu, back, QDP_neighbor[nu], QDP_backward, QDP_all); + QDP_M_eq_M_times_M(linkinnu, link, innu, QDP_all); + QDP_discard_M(innu); + QDP_M_peq_M_times_Ma(out, linkinnu, linkmu, QDP_all); + QDP_discard_M(linkmu); + QDP_M_peq_M(out, backnu, QDP_all); + QDP_discard_M(backnu); +#define STAPLE_FLOPS (3*198+216+18) + +#undef link +#undef linkmu +#undef in +#undef innu +#undef linkin +#undef back +#undef backnu +#undef linkinnu +} +]# + +#[ +static void +side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0, + QDP_ColorMatrix *top0, int mu, int nu, QDP_ColorMatrix *stpl) +{ +#define side ftmp0[0] +#define sidemu ftmp[0][mu] +#define top ftmp0[1] +#define topnu ftmp[1][nu] +#define bot ftmp0[2] +#define botnu ftmp[2][nu] +#define sidebot mtmp[0] +#define sidetop mtmp[1] +#define topnusidebot btmp0[0] +#define fbmu btmp[0][mu] +#define botnusidetop btmp0[1] +#define fmbmu btmp[1][mu] +#define sidebotsidemu btmp0[2] +#define stm btmp[2][nu] +#define botnusidemu mtmp[2] +#define botsidemu mtmp[3] + + // force += bot * sidemu * topnu+ + // force -= bot-mu+ * side-mu * topnu-mu + // -= top <-> bot + // stpl += side * botnu * sidemu+ + // stpl += side-nu+ * bot-nu * sidemu-nu + QDP_M_eq_M(side, side0, QDP_all); + QDP_M_eq_sM(sidemu, side, QDP_neighbor[mu], QDP_forward, QDP_all); + QDP_M_eq_M(top, top0, QDP_all); + QDP_M_eq_sM(topnu, top, QDP_neighbor[nu], QDP_forward, QDP_all); + QDP_M_eq_M(bot, bot0, QDP_all); + QDP_M_eq_sM(botnu, bot, QDP_neighbor[nu], QDP_forward, QDP_all); + QDP_M_eq_Ma_times_M(sidebot, side, bot, QDP_all); + QDP_M_eq_Ma_times_M(sidetop, side, top, QDP_all); + QDP_M_eq_Ma_times_M(topnusidebot, topnu, sidebot, QDP_all); + QDP_M_eq_sM(fbmu, topnusidebot, QDP_neighbor[mu], QDP_backward, QDP_all); + QDP_M_eq_Ma_times_M(botnusidetop, botnu, sidetop, QDP_all); + QDP_M_eq_sM(fmbmu, botnusidetop, QDP_neighbor[mu], QDP_backward, QDP_all); + QDP_M_eq_M_times_M(sidebotsidemu, sidebot, sidemu, QDP_all); + QDP_M_eq_sM(stm, sidebotsidemu, QDP_neighbor[nu], QDP_backward, QDP_all); + QDP_M_eq_M_times_Ma(botnusidemu, botnu, sidemu, QDP_all); + QDP_discard_M(botnu); + QDP_M_peq_M_times_M(stpl, side, botnusidemu, QDP_all); + //QDP_M_meq_M_times_Ma(force, top, botnusidemu, QDP_all); + QDP_M_peq_M_times_Ma(force, top, botnusidemu, QDP_all); + QDP_M_eq_M_times_M(botsidemu, bot, sidemu, QDP_all); + QDP_discard_M(sidemu); + QDP_M_peq_M_times_Ma(force, botsidemu, topnu, QDP_all); + QDP_discard_M(topnu); + //QDP_M_meq_Ma(force, fbmu, QDP_all); + QDP_M_peq_Ma(force, fbmu, QDP_all); + QDP_discard_M(fbmu); + QDP_M_peq_Ma(force, fmbmu, QDP_all); + QDP_discard_M(fmbmu); + QDP_M_peq_M(stpl, stm, QDP_all); + QDP_discard_M(stm); +#define SIDE_FORCE_FLOPS (7*198+3*216+3*18) + +#undef side +#undef sidemu +#undef top +#undef topnu +#undef bot +#undef botnu +#undef sidebot +#undef sidetop +#undef topnusidebot +#undef fbmu +#undef botnusidetop +#undef fmbmu +#undef sidebotsidemu +#undef stm +#undef botnusidemu +#undef botsidemu +} +]# + +proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, + coef: Fat7lCoefs, mid: auto) = + tic() + var nflops = 0 + let coefL = coef.lepage + let coef1 = coef.oneLink - 6*coefL + let coef3 = coef.threeStaple + let coef5 = coef.fiveStaple + let coef7 = coef.sevenStaple + let have5 = (coef5!=0.0) or (coef7!=0.0) or (coefL!=0.0) + let have3 = (coef3!=0.0) or have5 + type lcm = type(gauge[0]) + let lo = gauge[0].l + proc newlcm: lcm = result.new(gauge[0].l) + var + stpl3: array[4,lcm] + mid5: array[4,lcm] + stpl5,mid3: lcm + s1: array[4,array[4,Shifter[lcm,type(gauge[0][0])]]] + s1a,s1b: array[4,Shifter[lcm,type(gauge[0][0])]] + tm1,tm2: lcm + sm1: array[4,Shifter[lcm,type(gauge[0][0])]] + if have3: + if have5: + for mu in 0..3: + stpl3[mu] = newlcm() + mid5[mu] = newlcm() + s1b[mu] = newShifter(gauge[0], mu, 1) + stpl5 = newlcm() + mid3 = newlcm() + for mu in 0..3: + for nu in 0..3: + if nu!=mu: + s1[mu][nu] = newShifter(gauge[mu], nu, 1) + s1a[mu] = newShifter(gauge[0], mu, 1) + sm1[mu] = newShifter(gauge[0], mu, -1) + tm1 = newlcm() + tm2 = newlcm() + threads: + for mu in 0..<4: + for nu in 0..<4: + if nu!=mu: + discard s1[mu][nu] ^* gauge[mu] + for sig in 0..3: + if have5: + for mu in 0..3: + if mu==sig: continue + #QDP_M_eq_zero(stpl3[mu], QDP_all); + stpl3[mu] := 0 + #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu); + symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig], + s1[mu][sig], s1[sig][mu], tm1, sm1[mu]) + #QDP_M_eq_zero(mid5[mu], QDP_all); + mid5[mu] := 0 + nflops += STAPLE_FLOPS + for rho in 0..3: + if rho==sig: continue + #QDP_M_eq_zero(stpl5, QDP_all); + stpl5 := 0 + if coef7!=0.0: + for mu in 0..3: + if mu==sig or mu==rho: continue + for nu in 0..3: + if nu==sig or nu==rho or nu==mu: continue + #staple(stpl5, stpl3[mu], gauge[nu], sig, nu); + discard s1a[nu] ^* stpl3[mu] + symStaple(stpl5, 1.0, gauge[nu], stpl3[mu], + s1[nu][sig], s1a[nu], tm1, sm1[nu]) + nflops += STAPLE_FLOPS + #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all); + stpl5 *= coef7 + nflops += 18 + #QDP_M_eq_zero(mid3, QDP_all); + mid3 := 0 + if coefL!=0.0: + #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all); + stpl5 += coefL * stpl3[rho] + nflops += 36 + if coefL!=0.0 or coef7!=0.0: + #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3); + discard s1a[rho] ^* stpl5 + discard s1b[rho] ^* mid[sig] + symStapleDeriv(deriv[rho], mid3, + gauge[rho], stpl5, s1[rho][sig], s1a[rho], + mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + if coefL!=0.0: + #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all); + mid5[rho] += coefL * mid3 + nflops += 36 + #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all); + #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all); + mid3 *= coef7 + #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all); + mid3 += coef5 * mid[sig] + nflops += 18+36 + for mu in 0..3: + if mu==sig or mu==rho: continue + for nu in 0..3: + if nu==sig or nu==rho or nu==mu: continue + #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]); + discard s1a[mu] ^* stpl3[nu] + discard s1b[mu] ^* mid3 + symStapleDeriv(deriv[mu], mid5[nu], + gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu], + mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + + if have3: + #QDP_M_eq_zero(mid3, QDP_all); + mid3 := 0 + for mu in 0..3: + if mu==sig: continue + if have5: + #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + stpl5 := coef3*mid[sig] + mid5[mu] + nflops += 36 + else: + #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all); + stpl5 := coef3*mid[sig] + nflops += 18 + #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3); + discard s1a[mu] ^* stpl5 + symStapleDeriv(deriv[mu], mid3, + gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu], + stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + #//QDP_M_meq_M(deriv[sig], mid3, QDP_all); + #QDP_M_peq_M(deriv[sig], mid3, QDP_all); + deriv[sig] += mid3 + nflops += 18 + if coef1!=0.0: + #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all); + deriv[sig] += coef1 * mid[sig] + nflops += 36 + + if coefL!=0.0: + # fix up Lepage term + let fixL = -coefL + for mu in 0..3: + #QDP_M_eq_zero(ftmp0[0], QDP_all); + tm1 := 0 + for nu in 0..3: + if nu==mu: continue + #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all); + tm2 := mid[nu].adj * gauge[nu] + #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all); + discard sm1[nu] ^* tm2 + #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all); + stpl5 := mid[nu] * gauge[nu].adj + #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all); + stpl5 -= sm1[nu].field + #QDP_discard_M(btmp[0][nu]); + #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all); + #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all); + tm1 += stpl5 + stpl5.adj + #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* tm1 + #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all); + stpl5 := tm1 * gauge[mu] + #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all); + stpl5 -= gauge[mu] * s1a[mu].field + #QDP_discard_M(ftmp[0][mu]); + #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all); + deriv[mu] += fixL * stpl5 + nflops += 4*(3*(2*198+3*18)+198+216+36) + + #info->final_sec = QOP_time() - dtime; + #info->final_flop = nflops*QDP_sites_on_node; + #info->status = QOP_SUCCESS; + +when isMainModule: + import qex + import physics/qcdTypes + import gauge + qexInit() + #var defaultGaugeFile = "l88.scidac" + let defaultLat = @[8,8,8,8] + defaultSetup() + for mu in 0.. Date: Fri, 6 Dec 2024 10:40:26 -0600 Subject: [PATCH 21/23] move PerfInfo --- src/base/profile.nim | 16 ++++++++++++++++ src/gauge/fat7l.nim | 10 +++++----- src/gauge/fat7lderiv.nim | 8 +++++--- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index de7ca62..455e399 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -7,6 +7,22 @@ getOptimPragmas() const noTicToc {.boolDefine.} = false +type PerfInfo* = object + count*: int + flops*: float + secs*: float +template clear*(pi: var PerfInfo) = + pi.count = 0.0 + pi.flops = 0.0 + pi.secs = 0.0 +template `+=`*(r: var PerfInfo, x: PerfInfo) = + r.count += x.count + r.flops += x.flops + r.secs += x.secs +proc `$`*(pi: PerfInfo): string = + result = system.`$`(pi) + result &= &" {1e-9*pi.flops/pi.secs:.2f} Gflops" + type TicType* = distinct int64 template getTics*: TicType = TicType(getMonoTime().ticks) template nsec*(t:TicType):int64 = int64(t) diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 9fdaea1..1a11024 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -1,10 +1,6 @@ import base import layout -#import gauge -#import strUtils - -type PerfInfo* = object - flops*: float +export PerfInfo type Fat7lCoefs* = object oneLink*: float @@ -168,6 +164,10 @@ proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto, #info.final_flop = nflop*QDP_sites_on_node; #info.status = QOP_SUCCESS; toc() + inc info.count + info.flops += nflop * gf[0].l.localGeom.prod + info.secs += getElapsedTime() + proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto) = makeImpLinks(info, fl, gf, coef, fl, gf, 0.0) diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim index f77b309..a579e4f 100644 --- a/src/gauge/fat7lderiv.nim +++ b/src/gauge/fat7lderiv.nim @@ -292,9 +292,8 @@ proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, #info->status = QOP_SUCCESS; when isMainModule: - import qex - import physics/qcdTypes - import gauge + import qex, physics/qcdTypes + import strformat qexInit() #var defaultGaugeFile = "l88.scidac" let defaultLat = @[8,8,8,8] @@ -324,7 +323,9 @@ when isMainModule: echo g.plaq echo g2.plaq makeImpLinks(info, fl, g, coef) + echo info makeImpLinks(info, fl2, g2, coef) + echo info echo fl.plaq echo fl2.plaq #echo ll.plaq @@ -335,6 +336,7 @@ when isMainModule: #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0 fat7lderiv(info, g, fd, coef, dg) + echo info for mu in 0..3: dfl[mu] := fl2[mu] - fl[mu] echo dfl[mu].norm2 From 5b97195b7022ca220b85dc3281d4eb16d36588ec Mon Sep 17 00:00:00 2001 From: James Osborn Date: Fri, 6 Dec 2024 12:50:32 -0600 Subject: [PATCH 22/23] add naik derivative --- src/gauge/fat7l.nim | 13 ++++--- src/gauge/fat7lderiv.nim | 74 ++++++++++++++++++++++++++++++++++--- src/physics/asqtadLinks.nim | 2 +- src/physics/hisqLinks.nim | 4 +- 4 files changed, 78 insertions(+), 15 deletions(-) diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 1a11024..49216b4 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -68,8 +68,9 @@ proc computeGenStaple(staple: auto, mu,nu: int, link: auto, coef: float, #if(link!=gauge[mu]) QDP_discard_M(ts0); #QDP_discard_M(ts2); -proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto, - ll: auto, gfLong: auto, naik: auto) = +proc makeImpLinks*(fl: auto, gf: auto, coef: auto, + ll: auto, gfLong: auto, naik: auto, + info: var PerfInfo) = tic() type lcm = type(gf[0]) proc QDP_create_M(): lcm = result.new(gf[0].l) @@ -168,8 +169,8 @@ proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto, info.flops += nflop * gf[0].l.localGeom.prod info.secs += getElapsedTime() -proc makeImpLinks*(info: var PerfInfo, fl: auto, gf: auto, coef: auto) = - makeImpLinks(info, fl, gf, coef, fl, gf, 0.0) +proc makeImpLinks*(fl: auto, gf: auto, coef: auto, info: var PerfInfo) = + makeImpLinks(fl, gf, coef, fl, gf, 0.0, info) when isMainModule: import qex @@ -196,8 +197,8 @@ when isMainModule: var g3 = g echo g.plaq - makeImpLinks(info, fl, g, coef) - makeImpLinks(info, fl, g, coef, ll, g3, naik) + makeImpLinks(fl, g, coef, info) + makeImpLinks(fl, g, coef, ll, g3, naik, info) echo fl.plaq echo ll.plaq echo pow(1.0,4)/6.0 diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim index a579e4f..08edd85 100644 --- a/src/gauge/fat7lderiv.nim +++ b/src/gauge/fat7lderiv.nim @@ -119,8 +119,9 @@ side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0 } ]# -proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, - coef: Fat7lCoefs, mid: auto) = +proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, + llderiv: auto, llgauge: auto, llmid: auto, naik: float, + perf: var PerfInfo) = tic() var nflops = 0 let coefL = coef.lepage @@ -287,9 +288,49 @@ proc fat7lderiv(perf: var PerfInfo, gauge: auto, deriv: auto, deriv[mu] += fixL * stpl5 nflops += 4*(3*(2*198+3*18)+198+216+36) + if naik!=0.0: + for mu in 0..3: + # force += mid * Umumu' * Umu' + # force -= U-mu' * mid-mu * Umu' + # force += U-mu' * U-mu-mu' * mid-mu-mu + #QDP_M_eq_M(Uf, U, QDP_all); + #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* llgauge[mu] + #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all); + tm1 := llgauge[mu].adj * llmid[mu] + #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all); + tm2 := s1a[mu].field.adj * llgauge[mu].adj + #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1b[mu] ^* tm2 + #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all); + llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj) + tm1 := llgauge[mu].adj * sm1[mu].field + #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all); + #QDP_discard_M(UmuUs); + #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all); + #QDP_discard_M(Umidbmu); + #QDP_discard_M(Umu); + #QDP_M_peq_M(f, f3, QDP_all); + #QDP_discard_M(f3); + #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all); + llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field) + #info->final_sec = QOP_time() - dtime; #info->final_flop = nflops*QDP_sites_on_node; #info->status = QOP_SUCCESS; + toc() + inc perf.count + perf.flops += nflops * gauge[0].l.localGeom.prod + perf.secs += getElapsedTime() + +proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, + perf: var PerfInfo) = + fat7lDeriv(deriv, gauge, mid, coef, deriv, gauge, mid, 0.0, perf) + when isMainModule: import qex, physics/qcdTypes @@ -308,13 +349,17 @@ when isMainModule: coef.fiveStaple = 1.0 coef.sevenStaple = 1.0 coef.lepage = 1.0 + var naik = 1.0 var fl = lo.newGauge() var fl2 = lo.newGauge() + var ll = lo.newGauge() + var ll2 = lo.newGauge() var dfl = lo.newGauge() var g2 = lo.newGauge() var dg = lo.newGauge() var fd = lo.newGauge() + var ld = lo.newGauge() for mu in 0.. Date: Fri, 6 Dec 2024 15:58:33 -0600 Subject: [PATCH 23/23] add threading to fat7 --- src/base/profile.nim | 2 +- src/base/threading.nim | 12 ++ src/gauge/fat7l.nim | 95 ++++++------ src/gauge/fat7lderiv.nim | 311 +++++++++++++++++++-------------------- 4 files changed, 210 insertions(+), 210 deletions(-) diff --git a/src/base/profile.nim b/src/base/profile.nim index 455e399..56b665e 100644 --- a/src/base/profile.nim +++ b/src/base/profile.nim @@ -12,7 +12,7 @@ type PerfInfo* = object flops*: float secs*: float template clear*(pi: var PerfInfo) = - pi.count = 0.0 + pi.count = 0 pi.flops = 0.0 pi.secs = 0.0 template `+=`*(r: var PerfInfo, x: PerfInfo) = diff --git a/src/base/threading.nim b/src/base/threading.nim index 5868975..1e5faba 100644 --- a/src/base/threading.nim +++ b/src/base/threading.nim @@ -460,6 +460,18 @@ macro threadSum2*(a:varargs[untyped]):auto = result.add(m) #echo result.treeRepr +type ThreadSingle*[T] = distinct T +template `[]`*[T](x: ThreadSingle[T]): auto = T(x) +template `[]=`*[T](x: ThreadSingle[T], y: auto): auto = + T(x) = y +template `=`*(x: var ThreadSingle, y: auto) = + threadSingle: + x[] = y +template `+=`*(x: var ThreadSingle, y: auto) = + threadSingle: + x[] += y +converter fromThreadSingle*[T](x: ThreadSingle[T]): T = T(x) + when isMainModule: threadsInit() echo threadNum, "/", numThreads diff --git a/src/gauge/fat7l.nim b/src/gauge/fat7l.nim index 49216b4..c845968 100644 --- a/src/gauge/fat7l.nim +++ b/src/gauge/fat7l.nim @@ -71,7 +71,7 @@ proc computeGenStaple(staple: auto, mu,nu: int, link: auto, coef: float, proc makeImpLinks*(fl: auto, gf: auto, coef: auto, ll: auto, gfLong: auto, naik: auto, info: var PerfInfo) = - tic() + tic("makeImpLinks") type lcm = type(gf[0]) proc QDP_create_M(): lcm = result.new(gf[0].l) var @@ -111,60 +111,49 @@ proc makeImpLinks*(fl: auto, gf: auto, coef: auto, for nu in 0..<4: if dir!=nu: tsg[dir][nu] = newShifter(gf[dir], nu, 1) - discard tsg[dir][nu] ^* gf[dir] - - for dir in 0..<4: - #QDP_M_eq_r_times_M(fl[dir], &coef1, gf[dir], QDP_all); - fl[dir] := coef1 * gf[dir] - if have3: - for nu in 0..<4: - if nu!=dir: - compute_gen_staple(staple, dir, nu, gf[dir], coef3, gf, fl, - tsg[dir][nu], tsg[nu][dir], t1, t2, ts2[nu]) - if coefL!=0.0: - compute_gen_staple(false, dir, nu, staple, coefL, gf, fl, - tsl[nu], tsg[nu][dir], t1, t2, ts2[nu]) - if coef5!=0.0 or coef7!=0.0: - for rho in 0..<4: - if (rho!=dir) and (rho!=nu): - compute_gen_staple(tempmat1, dir, rho, staple, coef5, gf, fl, - tsl[rho], tsg[rho][dir], t1, t2, ts2[rho]) - if coef7!=0.0: - for sig in 0..<4: - if (sig!=dir) and (sig!=nu) and (sig!=rho): - compute_gen_staple(false, dir, sig, tempmat1,coef7,gf,fl, - ts1[sig],tsg[sig][dir],t1,t2,ts2[sig]) - - # long links - if naik!=0.0: + threads: + for dir in 0..<4: + for nu in 0..<4: + if dir!=nu: + discard tsg[dir][nu] ^*! gf[dir] + + toc("main loop") + threads: for dir in 0..<4: - #QDP_M_eq_sM(staple, gfLong[dir], QDP_neighbor[dir], QDP_forward,QDP_all) - #QDP_M_eq_M_times_M(tempmat1, gfLong[dir], staple, QDP_all) - #QDP_M_eq_sM(staple, tempmat1, QDP_neighbor[dir], QDP_forward, QDP_all) - #QDP_M_eq_M_times_M(ll[dir], gfLong[dir], staple, QDP_all) - #QDP_M_eq_r_times_M(ll[dir], &naik, ll[dir], QDP_all) - discard tsl[dir] ^* gfLong[dir] - discard ts1[dir] ^* (gfLong[dir] * tsl[dir].field) - ll[dir] := naik * (gfLong[dir] * ts1[dir].field) - - #[ - if have3 or naik!=0.0: - QDP_destroy_M(staple); - QDP_destroy_M(tempmat1); - if have3: - QDP_destroy_M(t1); - QDP_destroy_M(t2); - for dir in 0..<4: - QDP_destroy_M(tsl[dir]); - QDP_destroy_M(ts1[dir]); - QDP_destroy_M(ts2[dir]); + #QDP_M_eq_r_times_M(fl[dir], &coef1, gf[dir], QDP_all); + fl[dir] := coef1 * gf[dir] + if have3: for nu in 0..<4: - if(tsg[dir][nu]!=nil) QDP_destroy_M(tsg[dir][nu]) - ]# - #info.final_sec = dtime; - #info.final_flop = nflop*QDP_sites_on_node; - #info.status = QOP_SUCCESS; - toc() + if nu!=dir: + compute_gen_staple(staple, dir, nu, gf[dir], coef3, gf, fl, + tsg[dir][nu], tsg[nu][dir], t1, t2, ts2[nu]) + if coefL!=0.0: + compute_gen_staple(false, dir, nu, staple, coefL, gf, fl, + tsl[nu], tsg[nu][dir], t1, t2, ts2[nu]) + if coef5!=0.0 or coef7!=0.0: + for rho in 0..<4: + if (rho!=dir) and (rho!=nu): + compute_gen_staple(tempmat1, dir, rho, staple, coef5, gf, fl, + tsl[rho], tsg[rho][dir], t1, t2, ts2[rho]) + if coef7!=0.0: + for sig in 0..<4: + if (sig!=dir) and (sig!=nu) and (sig!=rho): + compute_gen_staple(false, dir, sig, tempmat1,coef7,gf,fl, + ts1[sig],tsg[sig][dir],t1,t2,ts2[sig]) + + # long links + if naik!=0.0: + for dir in 0..<4: + #QDP_M_eq_sM(staple, gfLong[dir], QDP_neighbor[dir], QDP_forward,QDP_all) + #QDP_M_eq_M_times_M(tempmat1, gfLong[dir], staple, QDP_all) + #QDP_M_eq_sM(staple, tempmat1, QDP_neighbor[dir], QDP_forward, QDP_all) + #QDP_M_eq_M_times_M(ll[dir], gfLong[dir], staple, QDP_all) + #QDP_M_eq_r_times_M(ll[dir], &naik, ll[dir], QDP_all) + discard tsl[dir] ^* gfLong[dir] + discard ts1[dir] ^* (gfLong[dir] * tsl[dir].field) + ll[dir] := naik * (gfLong[dir] * ts1[dir].field) + + toc("end") inc info.count info.flops += nflop * gf[0].l.localGeom.prod info.secs += getElapsedTime() diff --git a/src/gauge/fat7lderiv.nim b/src/gauge/fat7lderiv.nim index 08edd85..5ba34fd 100644 --- a/src/gauge/fat7lderiv.nim +++ b/src/gauge/fat7lderiv.nim @@ -122,8 +122,8 @@ side_force(QDP_ColorMatrix *force, QDP_ColorMatrix *bot0, QDP_ColorMatrix *side0 proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, llderiv: auto, llgauge: auto, llmid: auto, naik: float, perf: var PerfInfo) = - tic() - var nflops = 0 + tic("fat7lDeriv") + var nflops = ThreadSingle[int](0) let coefL = coef.lepage let coef1 = coef.oneLink - 6*coefL let coef3 = coef.threeStaple @@ -162,167 +162,168 @@ proc fat7lDeriv(deriv: auto, gauge: auto, mid: auto, coef: Fat7lCoefs, for mu in 0..<4: for nu in 0..<4: if nu!=mu: - discard s1[mu][nu] ^* gauge[mu] - for sig in 0..3: - if have5: - for mu in 0..3: - if mu==sig: continue - #QDP_M_eq_zero(stpl3[mu], QDP_all); - stpl3[mu] := 0 - #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu); - symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig], - s1[mu][sig], s1[sig][mu], tm1, sm1[mu]) - #QDP_M_eq_zero(mid5[mu], QDP_all); - mid5[mu] := 0 - nflops += STAPLE_FLOPS - for rho in 0..3: - if rho==sig: continue - #QDP_M_eq_zero(stpl5, QDP_all); - stpl5 := 0 - if coef7!=0.0: + discard s1[mu][nu] ^*! gauge[mu] + threads: + for sig in 0..3: + if have5: + for mu in 0..3: + if mu==sig: continue + #QDP_M_eq_zero(stpl3[mu], QDP_all); + stpl3[mu] := 0 + #staple(stpl3[mu], gauge[sig], gauge[mu], sig, mu); + symStaple(stpl3[mu], 1.0, gauge[mu], gauge[sig], + s1[mu][sig], s1[sig][mu], tm1, sm1[mu]) + #QDP_M_eq_zero(mid5[mu], QDP_all); + mid5[mu] := 0 + nflops += STAPLE_FLOPS + for rho in 0..3: + if rho==sig: continue + #QDP_M_eq_zero(stpl5, QDP_all); + stpl5 := 0 + if coef7!=0.0: + for mu in 0..3: + if mu==sig or mu==rho: continue + for nu in 0..3: + if nu==sig or nu==rho or nu==mu: continue + #staple(stpl5, stpl3[mu], gauge[nu], sig, nu); + discard s1a[nu] ^* stpl3[mu] + symStaple(stpl5, 1.0, gauge[nu], stpl3[mu], + s1[nu][sig], s1a[nu], tm1, sm1[nu]) + nflops += STAPLE_FLOPS + #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all); + stpl5 *= coef7 + nflops += 18 + #QDP_M_eq_zero(mid3, QDP_all); + mid3 := 0 + if coefL!=0.0: + #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all); + stpl5 += coefL * stpl3[rho] + nflops += 36 + if coefL!=0.0 or coef7!=0.0: + #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3); + discard s1a[rho] ^* stpl5 + discard s1b[rho] ^* mid[sig] + symStapleDeriv(deriv[rho], mid3, + gauge[rho], stpl5, s1[rho][sig], s1a[rho], + mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + if coefL!=0.0: + #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all); + mid5[rho] += coefL * mid3 + nflops += 36 + #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all); + #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all); + mid3 *= coef7 + #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all); + mid3 += coef5 * mid[sig] + nflops += 18+36 for mu in 0..3: - if mu==sig or mu==rho: continue + if mu==sig or mu==rho: continue for nu in 0..3: if nu==sig or nu==rho or nu==mu: continue - #staple(stpl5, stpl3[mu], gauge[nu], sig, nu); - discard s1a[nu] ^* stpl3[mu] - symStaple(stpl5, 1.0, gauge[nu], stpl3[mu], - s1[nu][sig], s1a[nu], tm1, sm1[nu]) - nflops += STAPLE_FLOPS - #QDP_M_eq_r_times_M(stpl5, &coef7, stpl5, QDP_all); - stpl5 *= coef7 - nflops += 18 + #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]); + discard s1a[mu] ^* stpl3[nu] + discard s1b[mu] ^* mid3 + symStapleDeriv(deriv[mu], mid5[nu], + gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu], + mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + + if have3: #QDP_M_eq_zero(mid3, QDP_all); mid3 := 0 - if coefL!=0.0: - #QDP_M_peq_r_times_M(stpl5, &coefL, stpl3[rho], QDP_all); - stpl5 += coefL * stpl3[rho] - nflops += 36 - if coefL!=0.0 or coef7!=0.0: - #side_force(deriv[rho], mid[sig], gauge[rho], stpl5, sig, rho, mid3); - discard s1a[rho] ^* stpl5 - discard s1b[rho] ^* mid[sig] - symStapleDeriv(deriv[rho], mid3, - gauge[rho], stpl5, s1[rho][sig], s1a[rho], - mid[sig], s1b[rho], tm1, tm2, sm1[rho], sm1[sig]) - nflops += SIDE_FORCE_FLOPS - if coefL!=0.0: - #QDP_M_peq_r_times_M(mid5[rho], &coefL, mid3, QDP_all); - mid5[rho] += coefL * mid3 - nflops += 36 - #//QDP_M_eqm_r_times_M(mid3, &coef7, mid3, QDP_all); - #QDP_M_eq_r_times_M(mid3, &coef7, mid3, QDP_all); - mid3 *= coef7 - #QDP_M_peq_r_times_M(mid3, &coef5, mid[sig], QDP_all); - mid3 += coef5 * mid[sig] - nflops += 18+36 for mu in 0..3: - if mu==sig or mu==rho: continue - for nu in 0..3: - if nu==sig or nu==rho or nu==mu: continue - #side_force(deriv[mu], mid3, gauge[mu], stpl3[nu], sig, mu, mid5[nu]); - discard s1a[mu] ^* stpl3[nu] - discard s1b[mu] ^* mid3 - symStapleDeriv(deriv[mu], mid5[nu], - gauge[mu], stpl3[nu], s1[mu][sig], s1a[mu], - mid3, s1b[mu], tm1, tm2, sm1[mu], sm1[sig]) - nflops += SIDE_FORCE_FLOPS + if mu==sig: continue + if have5: + #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); + stpl5 := coef3*mid[sig] + mid5[mu] + nflops += 36 + else: + #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all); + stpl5 := coef3*mid[sig] + nflops += 18 + #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3); + discard s1a[mu] ^* stpl5 + symStapleDeriv(deriv[mu], mid3, + gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu], + stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig]) + nflops += SIDE_FORCE_FLOPS + #//QDP_M_meq_M(deriv[sig], mid3, QDP_all); + #QDP_M_peq_M(deriv[sig], mid3, QDP_all); + deriv[sig] += mid3 + nflops += 18 + if coef1!=0.0: + #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all); + deriv[sig] += coef1 * mid[sig] + nflops += 36 - if have3: - #QDP_M_eq_zero(mid3, QDP_all); - mid3 := 0 + if coefL!=0.0: + # fix up Lepage term + let fixL = -coefL for mu in 0..3: - if mu==sig: continue - if have5: - #//QDP_M_eq_r_times_M_minus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); - #QDP_M_eq_r_times_M_plus_M(stpl5, &coef3, mid[sig], mid5[mu], QDP_all); - stpl5 := coef3*mid[sig] + mid5[mu] - nflops += 36 - else: - #QDP_M_eq_r_times_M(stpl5, &coef3, mid[sig], QDP_all); - stpl5 := coef3*mid[sig] - nflops += 18 - #side_force(deriv[mu], stpl5, gauge[mu], gauge[sig], sig, mu, mid3); - discard s1a[mu] ^* stpl5 - symStapleDeriv(deriv[mu], mid3, - gauge[mu], gauge[sig], s1[mu][sig], s1[sig][mu], - stpl5, s1a[mu], tm1, tm2, sm1[mu], sm1[sig]) - nflops += SIDE_FORCE_FLOPS - #//QDP_M_meq_M(deriv[sig], mid3, QDP_all); - #QDP_M_peq_M(deriv[sig], mid3, QDP_all); - deriv[sig] += mid3 - nflops += 18 - if coef1!=0.0: - #QDP_M_peq_r_times_M(deriv[sig], &coef1, mid[sig], QDP_all); - deriv[sig] += coef1 * mid[sig] - nflops += 36 - - if coefL!=0.0: - # fix up Lepage term - let fixL = -coefL - for mu in 0..3: - #QDP_M_eq_zero(ftmp0[0], QDP_all); - tm1 := 0 - for nu in 0..3: - if nu==mu: continue - #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all); - tm2 := mid[nu].adj * gauge[nu] - #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all); - discard sm1[nu] ^* tm2 - #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all); - stpl5 := mid[nu] * gauge[nu].adj - #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all); - stpl5 -= sm1[nu].field - #QDP_discard_M(btmp[0][nu]); - #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all); - #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all); - tm1 += stpl5 + stpl5.adj - #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all); - discard s1a[mu] ^* tm1 - #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all); - stpl5 := tm1 * gauge[mu] - #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all); - stpl5 -= gauge[mu] * s1a[mu].field - #QDP_discard_M(ftmp[0][mu]); - #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all); - deriv[mu] += fixL * stpl5 - nflops += 4*(3*(2*198+3*18)+198+216+36) + #QDP_M_eq_zero(ftmp0[0], QDP_all); + tm1 := 0 + for nu in 0..3: + if nu==mu: continue + #QDP_M_eq_Ma_times_M(btmp0[0], mid[nu], gauge[nu], QDP_all); + tm2 := mid[nu].adj * gauge[nu] + #QDP_M_eq_sM(btmp[0][nu], btmp0[0], QDP_neighbor[nu], QDP_backward, QDP_all); + discard sm1[nu] ^* tm2 + #QDP_M_eq_M_times_Ma(stpl5, mid[nu], gauge[nu], QDP_all); + stpl5 := mid[nu] * gauge[nu].adj + #QDP_M_meq_M(stpl5, btmp[0][nu], QDP_all); + stpl5 -= sm1[nu].field + #QDP_discard_M(btmp[0][nu]); + #QDP_M_peq_M(ftmp0[0], stpl5, QDP_all); + #QDP_M_peq_Ma(ftmp0[0], stpl5, QDP_all); + tm1 += stpl5 + stpl5.adj + #QDP_M_eq_sM(ftmp[0][mu], ftmp0[0], QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* tm1 + #QDP_M_eq_M_times_M(stpl5, ftmp0[0], gauge[mu], QDP_all); + stpl5 := tm1 * gauge[mu] + #QDP_M_meq_M_times_M(stpl5, gauge[mu], ftmp[0][mu], QDP_all); + stpl5 -= gauge[mu] * s1a[mu].field + #QDP_discard_M(ftmp[0][mu]); + #QDP_M_peq_r_times_M(deriv[mu], &fixL, stpl5, QDP_all); + deriv[mu] += fixL * stpl5 + nflops += 4*(3*(2*198+3*18)+198+216+36) - if naik!=0.0: - for mu in 0..3: - # force += mid * Umumu' * Umu' - # force -= U-mu' * mid-mu * Umu' - # force += U-mu' * U-mu-mu' * mid-mu-mu - #QDP_M_eq_M(Uf, U, QDP_all); - #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all); - discard s1a[mu] ^* llgauge[mu] - #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all); - tm1 := llgauge[mu].adj * llmid[mu] - #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all); - discard sm1[mu] ^* tm1 - #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all); - tm2 := s1a[mu].field.adj * llgauge[mu].adj - #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all); - discard s1b[mu] ^* tm2 - #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all); - llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj) - tm1 := llgauge[mu].adj * sm1[mu].field - #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all); - discard sm1[mu] ^* tm1 - #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all); - #QDP_discard_M(UmuUs); - #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all); - #QDP_discard_M(Umidbmu); - #QDP_discard_M(Umu); - #QDP_M_peq_M(f, f3, QDP_all); - #QDP_discard_M(f3); - #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all); - llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field) + if naik!=0.0: + for mu in 0..3: + # force += mid * Umumu' * Umu' + # force -= U-mu' * mid-mu * Umu' + # force += U-mu' * U-mu-mu' * mid-mu-mu + #QDP_M_eq_M(Uf, U, QDP_all); + #QDP_M_eq_sM(Umu, Uf, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1a[mu] ^* llgauge[mu] + #QDP_M_eq_Ma_times_M(Umid, Uf, mid, QDP_all); + tm1 := llgauge[mu].adj * llmid[mu] + #QDP_M_eq_sM(Umidbmu, Umid, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_Ma_times_Ma(UmuU, Umu, Uf, QDP_all); + tm2 := s1a[mu].field.adj * llgauge[mu].adj + #QDP_M_eq_sM(UmuUs, UmuU, QDP_neighbor[mu], QDP_forward, QDP_all); + discard s1b[mu] ^* tm2 + #QDP_M_eq_Ma_times_M(f3b, Uf, Umidbmu, QDP_all); + llderiv[mu] += naik*(sm1[mu].field*s1a[mu].field.adj) + tm1 := llgauge[mu].adj * sm1[mu].field + #QDP_M_eq_sM(f3, f3b, QDP_neighbor[mu], QDP_backward, QDP_all); + discard sm1[mu] ^* tm1 + #QDP_M_eq_M_times_M(f, mid, UmuUs, QDP_all); + #QDP_discard_M(UmuUs); + #QDP_M_peq_M_times_Ma(f, Umidbmu, Umu, QDP_all); + #QDP_discard_M(Umidbmu); + #QDP_discard_M(Umu); + #QDP_M_peq_M(f, f3, QDP_all); + #QDP_discard_M(f3); + #QDP_M_peq_r_times_M(deriv[mu], &coefN, f, QDP_all); + llderiv[mu] += naik*(llmid[mu]*s1b[mu].field+sm1[mu].field) #info->final_sec = QOP_time() - dtime; #info->final_flop = nflops*QDP_sites_on_node; #info->status = QOP_SUCCESS; - toc() + toc("end") inc perf.count perf.flops += nflops * gauge[0].l.localGeom.prod perf.secs += getElapsedTime() @@ -368,17 +369,14 @@ when isMainModule: echo g.plaq echo g2.plaq makeImpLinks(fl, g, coef, info) + info.clear + resetTimers() + makeImpLinks(fl, g, coef, info) echo info makeImpLinks(fl2, g2, coef, info) echo info echo fl.plaq echo fl2.plaq - #echo ll.plaq - #echo pow(1.0,4)/6.0 - #echo pow(1.0+6.0,4)/6.0 - #echo pow(1.0+6.0+6.0*4.0,4)/6.0 - #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0,4)/6.0 - #echo pow(1.0+6.0+6.0*4.0+6.0*4.0*2.0+6.0,4)/6.0 fat7lderiv(fd, g, dg, coef, info) echo info @@ -406,4 +404,5 @@ when isMainModule: dfl[mu] -= ld[mu] echo dfl[mu].norm2 + echoProf() qexFinalize()