diff --git a/investigations/flatLongTest/EulerArray.hpp b/investigations/flatLongTest/EulerArray.hpp new file mode 100644 index 0000000..e69de29 diff --git a/investigations/flatLongTest/EulerAtomic.hpp b/investigations/flatLongTest/EulerAtomic.hpp new file mode 100644 index 0000000..e69de29 diff --git a/investigations/flatLongTest/KSArray.hpp b/investigations/flatLongTest/KSArray.hpp new file mode 100644 index 0000000..7b5c2b5 --- /dev/null +++ b/investigations/flatLongTest/KSArray.hpp @@ -0,0 +1,342 @@ + + + +//Read in the data from the global right/left variables to the shared temper variable. +__device__ __forceinline__ +void readIn(double *temp, const double *rights, const double *lefts, int td, int gd) +{ + int leftidx = disc.ht + (((td>>2) & 1) * disc.base) + (td & 3) - (4 + ((td>>2)<<1)); + int rightidx = disc.ht + (((td>>2) & 1) * disc.base) + ((td>>2)<<1) + (td & 3); + + temp[leftidx] = rights[gd]; + temp[rightidx] = lefts[gd]; +} + +__device__ __forceinline__ +void writeOutRight(double *temp, double *rights, double *lefts, int td, int gd, int bd) +{ + int gdskew = (gd + bd) & disc.idxend; + int leftidx = (((td>>2) & 1) * disc.base) + ((td>>2)<<1) + (td & 3) + 2; + int rightidx = (disc.base - 6) + (((td>>2) & 1) * disc.base) + (td & 3) - ((td>>2)<<1); + rights[gdskew] = temp[rightidx]; + lefts[gd] = temp[leftidx]; +} + +__device__ __forceinline__ +void writeOutLeft(double *temp, double *rights, double *lefts, int td, int gd, int bd) +{ + int gdskew = (gd - bd) & disc.idxend; + int leftidx = (((td>>2) & 1) * disc.base) + ((td>>2)<<1) + (td & 3) + 2; + int rightidx = (disc.base-6) + (((td>>2) & 1) * disc.base) + (td & 3) - ((td>>2)<<1); + rights[gd] = temp[rightidx]; + lefts[gdskew] = temp[leftidx]; +} + +__device__ __forceinline__ +double stutterStep(const double *u, const int loc[5]) +{ + return u[loc[2]] - disc.dt_half * (convect(u[loc[1]], u[loc[3]]) + secondDer(u[loc[1]], u[loc[3]], u[loc[2]]) + + fourthDer(u[loc[0]], u[loc[1]], u[loc[2]], u[loc[3]], u[loc[4]])); +} + +__device__ +double finalStep(const double *u, const int loc[5]) +{ + return (-disc.dt * (convect(u[loc[1]], u[loc[3]]) + secondDer(u[loc[1]], u[loc[3]], u[loc[2]]) + + fourthDer(u[loc[0]], u[loc[1]], u[loc[2]], u[loc[3]], u[loc[4]]))); +} + +//Classic +template +__global__ void +classicKS(const double *ks_in, double *ks_out) +{ + auto gidz = idxes<5>(); + + if (FINAL) ks_out[gidz.a[2]] += finalStep(ks_in, gidz.a); + else ks_out[gidz.a[2]] = stutterStep(ks_in, gidz.a); +} + +__global__ +void +upTriangle(const double *IC, double *outRight, double *outLeft) +{ + extern __shared__ double temper[]; + + int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID + int tididx = threadIdx.x + 2; + + int step2; + + int tid_top[5], tid_bottom[5]; + #pragma unroll + for (int k = -2; k<3; k++) + { + tid_top[k+2] = tididx + k + disc.base; + tid_bottom[k+2] = tididx + k; + } + + //Assign the initial values to the first row in temper, each block + //has it's own version of temper shared among its threads. + temper[tididx] = IC[gid]; + + __syncthreads(); + + if (threadIdx.x > 1 && threadIdx.x <(blockDim.x-2)) + { + temper[tid_top[2]] = stutterStep(temper, tid_bottom); + } + + __syncthreads(); + + for (int k = 4; k<(blockDim.x/2); k+=4) + { + if (threadIdx.x < (blockDim.x-k) && threadIdx.x >= k) + { + temper[tididx] += finalStep(temper, tid_top); + } + + step2 = k + 2; + __syncthreads(); + + if (threadIdx.x < (blockDim.x-step2) && threadIdx.x >= step2) + { + temper[tid_top[2]] = stutterStep(temper, tid_bottom); + } + + //Make sure the threads are synced + __syncthreads(); + + } + + writeOutRight(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x); + +} + +__global__ +void +downTriangle(double *IC, const double *inRight, const double *inLeft) +{ + extern __shared__ double temper[]; + + int gid = blockDim.x * blockIdx.x + threadIdx.x; + int tididx = threadIdx.x+2; + + int step2; + + int tid_top[5], tid_bottom[5]; + #pragma unroll + for (int k = -2; k<3; k++) + { + tid_top[k+2] = tididx + k + disc.base; + tid_bottom[k+2] = tididx + k; + } + + readIn(temper, inRight, inLeft, threadIdx.x, gid); + + __syncthreads(); + + for (int k = (disc.ht-2); k>0; k-=4) + { + if (tididx < (disc.base-k) && tididx >= k) + { + temper[tid_top[2]] = stutterStep(temper, tid_bottom); + } + + step2 = k-2; + __syncthreads(); + + if (tididx < (disc.base-step2) && tididx >= step2) + { + temper[tididx] += finalStep(temper, tid_top); + } + + //Make sure the threads are synced + __syncthreads(); + } + + IC[gid] = temper[tididx]; +} + +template +__global__ void +wholeDiamond(double *inRight, double *inLeft, double *outRight, double *outLeft) +{ + extern __shared__ double temper[]; + + int gid = blockDim.x * blockIdx.x + threadIdx.x; + int tididx = threadIdx.x + 2; + + int step2; + + int tid_top[5], tid_bottom[5]; + #pragma unroll + for (int k = -2; k<3; k++) + { + tid_top[k+2] = tididx + k + disc.base; + tid_bottom[k+2] = tididx + k; + } + + readIn(temper, inRight, inLeft, threadIdx.x, gid); + + __syncthreads(); + + for (int k = (disc.ht-2); k>0; k-=4) + { + if (tididx < (disc.base-k) && tididx >= k) + { + temper[tid_top[2]] = stutterStep(temper, tid_bottom); + } + + step2 = k-2; + __syncthreads(); + + if (tididx < (disc.base-step2) && tididx >= step2) + { + temper[tididx] += finalStep(temper, tid_top); + } + + //Make sure the threads are synced + __syncthreads(); + } + + //-------------------TOP PART------------------------------------------ + + if (threadIdx.x > 1 && threadIdx.x <(blockDim.x-2)) + { + temper[tid_top[2]] = stutterStep(temper, tid_bottom); + } + + __syncthreads(); + + //The initial conditions are timslice 0 so start k at 1. + for (int k = 4; k<(blockDim.x/2); k+=4) + { + if (threadIdx.x < (blockDim.x-k) && threadIdx.x >= k) + { + temper[tididx] += finalStep(temper, tid_top); + } + + step2 = k+2; + __syncthreads(); + + if (threadIdx.x < (blockDim.x-step2) && threadIdx.x >= step2) + { + temper[tid_top[2]] = stutterStep(temper, tid_bottom); + } + + //Make sure the threads are synced + __syncthreads(); + + } + + //After the triangle has been computed, the right and left shared arrays are + //stored in global memory by the global thread ID since (conveniently), + //they're the same size as a warp! + if (SPLIT) + { + writeOutLeft(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x); + } + else + { + writeOutRight(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x); + } +} + +double +classicWrapper(double *aState) +{ + std::cout << " -- Classic Array -- " << std::endl; + + const int aSize = sizeof(double) * rv.dv; + int tstep = 0; + // I'm just going to malloc in here. + + double *dks_in, *dks_out; + + cudaMalloc((void **)&dks_in, aSize); + cudaMalloc((void **)&dks_out, aSize); + + // Copy the initial conditions to the device array. + cudaMemcpy(dks_in, aState, aSize, cudaMemcpyHostToDevice); + double t_eq = 0.0; + + while (t_eq <= rv.tf) + { + classicKS <<< rv.bks, rv.tpb >>> (dks_in, dks_out); + classicKS <<< rv.bks, rv.tpb >>> (dks_out, dks_in); + tstep += 4; + t_eq += rv.dt; + + } + + cudaMemcpy(aState, dks_in, aSize, cudaMemcpyDeviceToHost); + std::cout << tstep << std::endl; + + cudaFree(dks_in); + cudaFree(dks_out); + + return t_eq; +} + + +//The host routine. +double +sweptWrapper(double *aState) +{ + std::cout << " -- Swept Array -- " << std::endl; + + const size_t szSt = sizeof(double); + const size_t smem = (2*rv.tpb+8)*szSt; + + std::cout << "SHARED MEMORY: " << smem << std::endl; + + double *d_IC, *d0_right, *d0_left, *d2_right, *d2_left; + const int aSize = sizeof(double) * rv.dv; + + cudaMalloc((void **)&d_IC, aSize); + cudaMalloc((void **)&d0_right, aSize); + cudaMalloc((void **)&d0_left, aSize); + cudaMalloc((void **)&d2_right, aSize); + cudaMalloc((void **)&d2_left, aSize); + + // Copy the initial conditions to the device array. + cudaMemcpy(d_IC, aState, aSize, cudaMemcpyHostToDevice); + //Start the counter and start the clock. + + //Every other step is a full timestep and each cycle is half tpb steps. + double t_fullstep = 0.25 * rv.dt * (double)rv.tpb; + double t_eq; + + t_eq = t_fullstep; + + upTriangle <<< rv.bks, rv.tpb, smem >>> (d_IC, d0_right, d0_left); + + //Split + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (d0_right, d0_left, d2_right, d2_left); + + // Call the kernels until you reach the iteration limit. + while(t_eq < rv.tf) + { + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (d2_right, d2_left, d0_right, d0_left); + + //Split + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (d0_right, d0_left, d2_right, d2_left); + + t_eq += t_fullstep; + } + + downTriangle <<< rv.bks, rv.tpb, smem >>> (d_IC, d2_right, d2_left); + + cudaMemcpy(aState, d_IC, aSize, cudaMemcpyDeviceToHost); + + cudaFree(d_IC); + cudaFree(d0_right); + cudaFree(d0_left); + cudaFree(d2_right); + cudaFree(d2_left); + std::cout << t_eq/rv.dt << std::endl; + + return t_eq; +} + diff --git a/investigations/flatLongTest/KSAtomic.hpp b/investigations/flatLongTest/KSAtomic.hpp new file mode 100644 index 0000000..825f021 --- /dev/null +++ b/investigations/flatLongTest/KSAtomic.hpp @@ -0,0 +1,232 @@ + + + +struct states { + double u[2]; // Full Step, Midpoint step state variables + double uxx; +}; + +__device__ __forceinline__ +double sumu(states ust, const int ix) { return ust.u[ix] + ust.uxx; } + + +__device__ +void ksStep(states *state, const int idx[3], const int tstep) +{ + const int itx = (tstep ^ 1); + state[idx[1]].u[tstep] = state[idx[1]].u[0] - (HALF * disc.dt * (itx+1)) * + (convect(state[idx[0]].u[itx], state[idx[2]].u[itx]) + + secondDer(sumu(state[idx[0]], itx), sumu(state[idx[2]], itx), sumu(state[idx[1]], itx))); +} + +__device__ +void stepUpdate(states *state, const int idx[3], const int tstep) +{ + const int ts = DIVMOD(tstep); + if (tstep & 1) //Odd - Rslt is 0 for even numbers + { + state[idx[1]].uxx = secondDer(state[idx[0]].u[ts], + state[idx[2]].u[ts], + state[idx[1]].u[ts]); + } + else + { + ksStep(state, idx, ts); + } +} + + +__global__ void classicStep(states *state, const int ts) +{ + auto gidz = idxes<3>(); + + stepUpdate(state, gidz.a, ts); +} + +__global__ void upTriangle(states *state, const int tstep) +{ + extern __shared__ states sharedState[]; + + int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID + int mid = blockDim.x >> 1; + const int tidx[3] = {threadIdx.x-1, threadIdx.x, threadIdx.x+1}; + + // Using tidx as tid is kind of confusing for reader but looks valid. + + sharedState[tidx[1]] = state[gid]; + + __syncthreads(); + + for (int k=1; k= k) + { + stepUpdate(sharedState, tidx, tstep + k); + } + __syncthreads(); + } + state[gid] = sharedState[tidx[1]]; +} + + +__global__ +void +downTriangle(states *state, const int tstep) +{ + extern __shared__ states sharedState[]; + + const int tid = threadIdx.x; // Thread index + const int mid = blockDim.x >> 1; // Half of block size + const int base = blockDim.x + 2; + const int gid = blockDim.x * blockIdx.x + tid; + const int tidx[3] = {tid, tid + 1, tid + 2}; + + int tnow = tstep; // read tstep into register. + + if (tid<2) sharedState[tid] = state[disc.modg(gid-1)]; + sharedState[tid+2] = state[disc.modg(gid+1)]; + __syncthreads(); + + for (int k=mid; k>0; k--) + { + if (tidx[1] < (base-k) && tidx[1] >= k) + { + stepUpdate(sharedState, tidx, tnow); + } + tnow++; + __syncthreads(); + } + state[gid] = sharedState[tidx[1]]; +} + +template +__global__ void +wholeDiamond(states *state, const int tstep) +{ + extern __shared__ states sharedState[]; + + const int mid = (blockDim.x >> 1); // Half of block size + const int base = blockDim.x + 2; + const int gid = blockDim.x * blockIdx.x + threadIdx.x + SPLIT*(disc.ht-2); + const int tidx[3] = {threadIdx.x, threadIdx.x + 1, threadIdx.x + 2}; + + int tnow = tstep; + int k; + + if (threadIdx.x<2) sharedState[threadIdx.x] = state[disc.modg(gid-1)]; + sharedState[tidx[2]] = state[disc.modg(gid+1)]; + + __syncthreads(); + + for (k=mid; k>0; k--) + { + if (tidx[1] < (base-k) && tidx[1] >= k) + { + stepUpdate(sharedState, tidx, tnow); + } + tnow++; + __syncthreads(); + } + + for (k=2; k<=mid; k++) + { + if (tidx[1] < (base-k) && tidx[1] >= k) + { + stepUpdate(sharedState, tidx, tnow); + } + tnow++; + __syncthreads(); + } + + state[disc.modg(gid)] = sharedState[tidx[1]]; +} + + + +double classicWrapper(states *aState) +{ + std::cout << " -- Classic Atom -- " << std::endl; + const size_t szSt = sizeof(states); + int tstep = 1; + const int npass = rv.dv * szSt; + + states *dState; + + cudaMalloc((void **) &dState, npass); + + // Copy the initial conditions to the device array. + cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice); + + double t_eq = 0.0; + + while (t_eq <= rv.tf) + { + classicStep <<< rv.bks, rv.tpb >>> (dState, tstep); + tstep++; + if (tstep%NSTEPS == 0) t_eq += rv.dt; + } + + cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost); + std::cout << tstep << std::endl; + cudaFree(dState); + + return t_eq; +} + +//The host routine. +double +sweptWrapper(states *aState) +{ + std::cout << " -- Swept Atom -- " << std::endl; + const size_t szSt = sizeof(states); + const size_t smem = (rv.tpb+2)*szSt; + + std::cout << "SHARED MEMORY: " << smem << std::endl; + int tstep = 1; + states *dState; + const int npass = rv.dv * szSt; + + cudaMalloc((void **)&dState, npass); + + // Copy the initial conditions to the device array. + cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice); + //Start the counter and start the clock. + + //Every other step is a full timestep and each cycle is half tpb steps. + + int stepi = rv.tpb/2; + double t_fullstep = 0.25 * rv.dt * (double)rv.tpb; + double t_eq; + + t_eq = t_fullstep; + + upTriangle <<< rv.bks, rv.tpb, smem >>> (dState, tstep); + + //Split + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (dState, tstep); + + tstep += stepi; + + // Call the kernels until you reach the iteration limit. + while(t_eq < rv.tf) + { + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (dState, tstep); + + tstep += stepi; + + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (dState, tstep); + + t_eq += t_fullstep; + tstep += stepi; + } + + downTriangle <<< rv.bks, rv.tpb, smem >>>(dState, tstep); + + tstep += stepi; + + cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost); + + cudaFree(dState); + std::cout << tstep-1 << " -- " << t_eq/rv.dt << std::endl; + return t_eq; +} diff --git a/investigations/flatLongTest/KSAtomicPad.hpp b/investigations/flatLongTest/KSAtomicPad.hpp new file mode 100644 index 0000000..e32b5be --- /dev/null +++ b/investigations/flatLongTest/KSAtomicPad.hpp @@ -0,0 +1,255 @@ + +#include + +struct states { + double u[2]; // Full Step, Midpoint step state variables + double uxx; + int tstep; +}; + +__device__ __forceinline__ +double sumu(states ust, const int ix) { return ust.u[ix] + ust.uxx; } + + +__device__ +void ksStep(states *state, const int idx[3], const int tstep) +{ + const int itx = (tstep ^ 1); + state[idx[1]].u[tstep] = state[idx[1]].u[0] - (HALF * disc.dt * (itx+1)) * + (convect(state[idx[0]].u[itx], state[idx[2]].u[itx]) + + secondDer(sumu(state[idx[0]], itx), sumu(state[idx[2]], itx), sumu(state[idx[1]], itx))); +} + +__device__ +void stepUpdate(states *state, const int idx[3]) +{ + const int ts = DIVMOD(state[idx[1]].tstep); + if (state[idx[1]].tstep & 1) //Odd - Rslt is 0 for even numbers + { + state[idx[1]].uxx = secondDer(state[idx[0]].u[ts], + state[idx[2]].u[ts], + state[idx[1]].u[ts]); + } + else + { + ksStep(state, idx, ts); + } + state[idx[1]].tstep++; +} + + +__global__ void classicStep(states *state) +{ + auto gidz = idxes<3>(); + + stepUpdate(state, gidz.a); +} + +__global__ void upTriangle(states *state) +{ + extern __shared__ states sharedState[]; + + int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID + int mid = blockDim.x >> 1; + const int tidx[3] = {threadIdx.x-1, threadIdx.x, threadIdx.x+1}; + + // Using tidx as tid is kind of confusing for reader but looks valid. + + sharedState[tidx[1]] = state[gid]; + + __syncthreads(); + + for (int k=1; k= k) + { + stepUpdate(sharedState, tidx); + } + __syncthreads(); + } + state[gid] = sharedState[tidx[1]]; +} + + +__global__ +void +downTriangle(states *state) +{ + extern __shared__ states sharedState[]; + + const int tid = threadIdx.x; // Thread index + const int mid = blockDim.x >> 1; // Half of block size + const int base = blockDim.x + 2; + const int gid = blockDim.x * blockIdx.x + tid; + const int tidx[3] = {tid, tid + 1, tid + 2}; + + + if (tid<2) sharedState[tid] = state[disc.modg(gid-1)]; + sharedState[tid+2] = state[disc.modg(gid+1)]; + __syncthreads(); + + for (int k=mid; k>0; k--) + { + if (tidx[1] < (base-k) && tidx[1] >= k) + { + stepUpdate(sharedState, tidx); + } + __syncthreads(); + } + state[gid] = sharedState[tidx[1]]; +} + +template +__global__ void +wholeDiamond(states *state) +{ + extern __shared__ states sharedState[]; + + const int mid = (blockDim.x >> 1); // Half of block size + const int base = blockDim.x + 2; + const int gid = blockDim.x * blockIdx.x + threadIdx.x + SPLIT*(disc.ht-2); + const int tidx[3] = {threadIdx.x, threadIdx.x + 1, threadIdx.x + 2}; + + int k; + + if (threadIdx.x<2) sharedState[threadIdx.x] = state[disc.modg(gid-1)]; + sharedState[tidx[2]] = state[disc.modg(gid+1)]; + + __syncthreads(); + + for (k=mid; k>0; k--) + { + if (tidx[1] < (base-k) && tidx[1] >= k) + { + stepUpdate(sharedState, tidx); + } + + __syncthreads(); + } + + for (k=2; k<=mid; k++) + { + if (tidx[1] < (base-k) && tidx[1] >= k) + { + stepUpdate(sharedState, tidx); + } + + __syncthreads(); + } + + state[disc.modg(gid)] = sharedState[tidx[1]]; +} + +size_t szSt = sizeof(states); +std::vector qtest(states *hSt) +{ + std::vector vi; + for (int i=0; i>> (dState); + tstep++; + if (tstep%NSTEPS == 0) t_eq += rv.dt; + } + + cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost); + std::cout << tstep << std::endl; + cudaFree(dState); + + std::vector stepss = qtest(aState); + std::cout << "UNIQUES: "; + for (int i : stepss) + std::cout << i << " "; + + std::cout << std::endl; + + return t_eq; +} + +//The host routine. +double +sweptWrapper(states *aState) +{ + std::cout << " -- Swept Atom -- " << std::endl; + const size_t szSt = sizeof(states); + const size_t smem = (rv.tpb+2)*szSt; + + std::cout << "SHARED MEMORY: " << smem << std::endl; + int tstep = 1; + states *dState; + const int npass = rv.dv * szSt; + + cudaMalloc((void **)&dState, npass); + + // Copy the initial conditions to the device array. + cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice); + //Start the counter and start the clock. + + //Every other step is a full timestep and each cycle is half tpb steps. + + int stepi = rv.tpb/2; + double t_fullstep = 0.25 * rv.dt * (double)rv.tpb; + double t_eq; + + t_eq = t_fullstep; + + upTriangle <<< rv.bks, rv.tpb, smem >>> (dState); + + //Split + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (dState); + + tstep += stepi; + + + // Call the kernels until you reach the iteration limit. + while(t_eq < rv.tf) + { + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (dState); + + tstep += stepi; + + wholeDiamond <<< rv.bks, rv.tpb, smem >>> (dState); + + t_eq += t_fullstep; + tstep += stepi; + } + + downTriangle <<< rv.bks, rv.tpb, smem >>>(dState); + + cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost); + + cudaFree(dState); + + std::vector stepss = qtest(aState); + std::cout << "UNIQUES: "; + for (int i : stepss) + std::cout << i << " "; + + std::cout << aState[0].tstep << " " << aState[50].tstep << " " << aState[0].tstep << " " << tstep << std::endl; + + return t_eq; +} diff --git a/investigations/flatLongTest/KSmain.cu b/investigations/flatLongTest/KSmain.cu new file mode 100644 index 0000000..cc3d18c --- /dev/null +++ b/investigations/flatLongTest/KSmain.cu @@ -0,0 +1,375 @@ +/** + Copyright (C) 2017 Kyle Niemeyer, niemeyek@oregonstate.edu AND + Daniel Magee, mageed@oregonstate.edu +*/ +/* + This file is distribued under the MIT License. See LICENSE at top level of directory or: . +*/ + +//COMPILE LINE! +// nvcc -o ./bin/KSOut KS1D_SweptShared.cu -gencode arch=compute_35,code=sm_35 -lm -restrict -Xcompiler -fopenmp --ptxas-options=-v + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cudaUtils.h" +#include + +// #include "matplotlib-cpp/matplotlibcpp.h" + +// namespace plt = matplotlibcpp; + +#ifndef GPUNUM + #define GPUNUM 0 +#endif + +#define QUARTER 0.25 +#define HALF 0.5 +#define ONE 1.0 +#define TWO 2.0 +#define FOUR 4.0 +#define SIX 6.0 + +#define NSTEPS 4 // How many substeps in timestep. + +// Since anyone would need to write a header and functions file, why not just hardwire this. +// If the user's number of steps isn't a power of 2 use the other one. + +#define MODULA(x) x & (NSTEPS-1) +// #define MODULA(x) x % NSTEPS + +#define DIVMOD(x) (MODULA(x)) >> 1 + + +__host__ __device__ +__forceinline__ +int mod(int a, int b) { return (a % b + b) % b; } + +const double dx = 0.5; + +struct discConstants{ + + double dxTimes4; + double dx2; + double dx4; + double dt; + double dt_half; + int base; + int ht; + int idxend; + int nPts; + __host__ __device__ + __forceinline__ + int modg(int gd) { return mod(gd, nPts);} +}; + +__constant__ discConstants disc; + +template +struct ar{ + int a[n]; +}; + +template +__device__ +ar idxes() +{ + const int gid = blockDim.x * blockIdx.x + threadIdx.x; + const int st = n/2; + ar ii; + #pragma unroll + for (int k=0; k tests(double *d1, states *s2, int dv, double tl) +{ + int t = 0; + double absit; + double summit = 0; + double sumd = 0; + std::vector sto; + + for(int k=0; k tl); + summit += absit; + sumd += std::abs(d1[k]); + sto.push_back(absit); + } + + if (!t) + { + std::cout << "EQUAL! " << summit << " OUT OF " << sumd << std::endl; + } + else + { + std::cout << "NOT EQUAL! " << summit << " OUT OF " << sumd << std::endl; + std::cout << t << " out of " << dv << std::endl; + } + return sto; +} + +std::vector tests(double *d1, double *s2, int dv, double tl) +{ + int t = 0; + double absit; + double summit = 0; + double sumd = 0; + std::vector sto; + + for(int k=0; k tl); + summit += absit; + sumd += std::abs(d1[k]); + sto.push_back(absit); + } + + if (!t) + { + std::cout << "EQUAL! " << summit << " OUT-OF " << sumd << std::endl; + } + else + { + std::cout << "NOT EQUAL! " << summit << " OUT-OF " << sumd << std::endl; + std::cout << t << " out of " << dv << std::endl; + } + return sto; +} + +inline void writeTime(const cudaTime &Timer) +{ + if (Timer.nTimes > 10) return; + + std::ofstream timeOut; + timeOut.open("atomicArray.csv", std::ios::app); + timeOut.seekp(0, std::ios::end); + bool starts = (timeOut.tellp() == 0); + if (starts) timeOut << "tpb,dv,sweptArray,sweptAtomic,classicArray,classicAtomic\n"; + + timeOut << rv.tpb << "," << rv.dv; + for (int i=0; i vatom; + std::vector varray; + std::vector xa; + + cudaMemcpyToSymbol(disc,&dsc,sizeof(dsc)); + + cudaTime *timer = new cudaTime(); + double *arrayState, *permState, *tradeState; + states *atomicState; + double tfm; + double tol=1.0e-3; + + const int arSize = sizeof(double) * rv.dv; + const int atomSize = sizeof(states) * rv.dv; + + cudaHostAlloc((void **) &permState, arSize, cudaHostAllocDefault); + cudaHostAlloc((void **) &arrayState, arSize, cudaHostAllocDefault); + cudaHostAlloc((void **) &atomicState, atomSize, cudaHostAllocDefault); + cudaHostAlloc((void **) &tradeState, arSize, cudaHostAllocDefault); + + double idou, nTimes; + for (int k=0; ktinit(); + tfm = sweptWrapper(arrayState); + nTimes = tfm/rv.dt; + std::cout << tfm << " " << nTimes << std::endl; + timer->tfinal(nTimes); + + cudaKernelCheck(); + + timer->tinit(); + tfm = sweptWrapper(atomicState); + nTimes = tfm/rv.dt; + std::cout << tfm << " " << nTimes << std::endl; + timer->tfinal(nTimes); + + cudaKernelCheck(); + + std::vector ipon = tests(arrayState, atomicState, rv.dv, tol); + + rv.tf = tfm-(0.25*rv.dt); // RESET IT + + // plt::named_plot("Differences", xa, ipon, "-r"); + // plt::legend(); + // plt::show(); + + std::cout << *timer << std::endl; + + std::cout << " ----------------------- " << std::endl; + std::cout << " CLASSIC TEST " << std::endl; + + for (int k=0; ktinit(); + tfm = classicWrapper(arrayState); + nTimes = tfm/rv.dt; + std::cout << tfm << " " << nTimes << std::endl; + timer->tfinal(nTimes); + + cudaKernelCheck(); + + std::cout << " Swept vs Classic ARRAY Agreement: "; + tests(tradeState, arrayState, rv.dv, tol); + + for (int k=0; ktinit(); + tfm = classicWrapper(atomicState); + nTimes = tfm/rv.dt; + std::cout << tfm << " " << nTimes << std::endl; + timer->tfinal(nTimes); + + cudaKernelCheck(); + std::cout << " Swept vs Classic ATOMIC Agreement: "; + tests(tradeState, atomicState, rv.dv, tol); + + ipon = tests(arrayState, atomicState, rv.dv, tol); + // plt::named_plot("Differences", xa, ipon, "-r"); + // plt::legend(); + // plt::show(); + + std::cout << *timer << std::endl; + for (int k = 0; k5) writeTime(*timer); + + // plt::named_plot("Atomic", xa, vatom, "-r"); + // plt::named_plot("Array", xa, varray, "-b"); + // plt::title("Swept"); + // plt::legend(); + // plt::show(); + + std::cout << " ----------------------- " << std::endl; + + cudaFreeHost(arrayState); + cudaFreeHost(atomicState); + cudaFreeHost(permState); + + delete timer; + + return 0; +} \ No newline at end of file diff --git a/investigations/flatLongTest/Makefile b/investigations/flatLongTest/Makefile new file mode 100644 index 0000000..b2a2cd9 --- /dev/null +++ b/investigations/flatLongTest/Makefile @@ -0,0 +1,27 @@ + + +CUDAFLAGS := -gencode arch=compute_35,code=sm_35 -restrict -O3 -lm -std=c++11 -w --ptxas-options=-v + +ifndef NOPLOT + PLOTSS=-I/usr/include/python2.7 -lpython2.7 +endif + +ifdef PD + EXTO:=PAD +else + PD= + EXTO:= +endif + +# It will put the compiled programs in the bin subdirectory +PROJECT_DIR = $(shell pwd) +OUT_DIR := ./bin +#$(PROJECT_DIR) + +default: $(OUT_DIR)/KSmain$(EXTO) + +$(OUT_DIR)/%$(EXTO): %.cu $(PROJECT_DIR)/*.hpp + nvcc $< -o $@ $(CUDAFLAGS) $(PLOTSS) $(PD) + +clean: + rm bin/* diff --git a/investigations/flatLongTest/atomicArray.csv b/investigations/flatLongTest/atomicArray.csv new file mode 100644 index 0000000..38325e4 --- /dev/null +++ b/investigations/flatLongTest/atomicArray.csv @@ -0,0 +1,61 @@ +tpb,dv,sweptArray,sweptAtomic,classicArray,classicAtomic +32,2048,2.40007,6.63434,10.4235,29.6741 +32,4096,2.6025,7.51959,12.404,32.9315 +32,8192,3.67676,12.7995,13.6601,47.2752 +32,16384,5.08457,19.3627,16.5338,66.6656 +32,32768,8.15502,33.3647,23.7059,110.6 +32,65536,14.7692,62.9635,37.4083,210.004 +32,131072,27.6137,119.673,71.1649,384.872 +32,262144,53.1766,232.851,129.996,737.279 +32,524288,104.275,459.421,247.739,1443.36 +32,1048576,207.71,912.714,483.13,2847.12 +64,2048,1.88515,5.63549,10.5808,29.8034 +64,4096,2.0648,6.41555,12.2258,35.0574 +64,8192,2.48611,9.1848,12.1253,44.8238 +64,16384,3.5951,24.7273,15.6067,65.5351 +64,32768,5.7921,46.2661,20.1162,100.891 +64,65536,10.2214,86.9771,29.9518,179.258 +64,131072,19.0078,171.308,52.3312,330.095 +64,262144,36.4424,339.954,92.4432,629.662 +64,524288,71.3321,677.602,172.641,1228.7 +64,1048576,141.045,1352.59,333.048,2426.57 +128,2048,1.53293,5.15251,10.1668,30.46 +128,4096,1.6771,6.14865,12.2177,37.4525 +128,8192,2.03253,8.64294,12.336,45.7973 +128,16384,2.8478,18.0308,15.5637,64.6168 +128,32768,4.58072,44.6867,20.4183,104.386 +128,65536,7.99725,83.013,29.927,182.722 +128,131072,14.8442,161.414,50.0962,335.065 +128,262144,28.267,321.015,88.2068,637.475 +128,524288,55.1163,638.64,164.32,1242.33 +128,1048576,108.843,1268.33,316.867,2452.8 +256,2048,1.3505,4.95406,11.3153,30.8443 +256,4096,1.57635,7.46392,12.4013,40.9095 +256,8192,1.92905,9.21015,13.8858,49.8651 +256,16384,2.6322,16.0087,15.3831,68.2373 +256,32768,4.22022,41.0684,20.4245,104.76 +256,65536,7.40611,68.8514,30.3287,186.47 +256,131072,13.5593,128.168,50.4761,339.494 +256,262144,25.8775,254.894,88.6498,642.157 +256,524288,50.4286,508.592,165.387,1247.21 +256,1048576,99.5966,1017.17,318.505,2459.41 +512,2048,1.52177,7.71707,12.0663,40.6267 +512,4096,1.52928,7.7432,12.3056,41.3265 +512,8192,2.00342,11.3485,13.6711,58.1932 +512,16384,2.51893,16.2842,17.0839,78.3168 +512,32768,4.5901,34.3451,21.7042,115.247 +512,65536,8.04788,56.7913,30.7422,195.298 +512,131072,14.6278,112.054,51.616,358.37 +512,262144,26.6582,218.602,89.9169,675.46 +512,524288,51.6933,419.75,167.19,1305.18 +512,1048576,101.67,826.456,321.632,2562.83 +1024,2048,1.97449,11.864,14.3608,62.6596 +1024,4096,1.98058,11.9167,14.5522,62.908 +1024,8192,1.99942,11.9681,15.0342,63.3168 +1024,16384,3.04024,24.1762,18.346,99.7006 +1024,32768,4.95251,36.956,23.664,138.721 +1024,65536,8.64458,62.194,33.9697,227.136 +1024,131072,14.7341,113.775,55.9044,395.561 +1024,262144,27.0346,215.657,97.4133,738.591 +1024,524288,52.0674,419.875,179.548,1418.75 +1024,1048576,103.674,828.409,344.129,2786.37 diff --git a/investigations/flatLongTest/atomicPadding.csv b/investigations/flatLongTest/atomicPadding.csv new file mode 100644 index 0000000..aac4add --- /dev/null +++ b/investigations/flatLongTest/atomicPadding.csv @@ -0,0 +1,61 @@ +tpb,dv,classicArray,classicAtomic,sweptArray,sweptAtomic +32,2048,10.5145,37.2119,2.35491,10.6148 +32,4096,12.3915,45.3359,2.58066,11.9703 +32,8192,13.5635,67.7075,3.66284,20.8072 +32,16384,16.8177,104.992,5.08569,32.5191 +32,32768,23.6894,182.908,8.12744,56.7365 +32,65536,37.5551,337.811,14.7585,104.285 +32,131072,71.0992,624.913,27.6128,201.835 +32,262144,130.013,1212.95,53.1499,396.345 +32,524288,247.638,2379.28,104.263,784.76 +32,1048576,482.919,4718.92,206.56,1562.93 +64,2048,10.4119,37.267,1.85109,9.61128 +64,4096,12.1452,45.4394,2.0203,10.7847 +64,8192,12.104,62.9761,2.472,14.5321 +64,16384,15.5412,102.015,3.56851,43.8224 +64,32768,20.223,172.222,5.78267,80.5145 +64,65536,29.8674,316.25,10.2022,153.761 +64,131072,52.4047,588.891,19.0093,305.169 +64,262144,92.2957,1136.43,36.4602,610.572 +64,524288,172.54,2230.38,71.3658,1221.9 +64,1048576,332.946,4418.75,141.111,2443.84 +128,2048,10.2113,41.1027,1.52081,10.6074 +128,4096,12.2082,49.9857,1.66086,12.5047 +128,8192,12.3124,65.5395,2.02396,14.9668 +128,16384,15.6147,99.2663,2.84368,29.5843 +128,32768,20.4046,175.152,4.57271,61.0777 +128,65536,29.8627,326.316,8.01774,118.123 +128,131072,49.9871,607.73,14.8674,226.007 +128,262144,88.2125,1159.59,28.2828,445.944 +128,524288,164.271,2260.31,55.136,892.943 +128,1048576,316.65,4457.8,108.906,1784.42 +256,2048,11.3281,41.7556,1.34541,11.2674 +256,4096,12.3938,56.648,1.59529,14.5261 +256,8192,13.7884,72.7568,1.91884,17.2108 +256,16384,15.447,106.5,2.6278,26.6545 +256,32768,20.4602,178.801,4.21093,49.3025 +256,65536,30.1686,329.489,7.3595,98.4544 +256,131072,50.2862,606.564,13.6116,180.673 +256,262144,88.6015,1159.79,25.8546,348.425 +256,524288,165.201,2259.47,50.4757,682.442 +256,1048576,318.283,4456.44,99.6421,1355.42 +512,2048,12.1479,57.5493,1.51909,15.2436 +512,4096,12.1921,57.9564,1.52334,15.3118 +512,8192,13.5092,87.884,2.00571,20.6509 +512,16384,17.1218,121.834,2.60008,36.8136 +512,32768,21.7258,187.141,4.68618,61.4165 +512,65536,30.7689,342.344,7.92118,103.953 +512,131072,51.5981,625.877,14.7362,189.635 +512,262144,89.9209,1175.56,26.6776,364.381 +512,524288,167.145,2275.55,52.309,710.869 +512,1048576,321.477,4482.63,102.133,1410.78 +1024,2048,14.3089,88.6339,1.96935,21.3711 +1024,4096,14.6172,89.0899,1.97433,21.4486 +1024,8192,15.0064,90.181,1.98356,21.5563 +1024,16384,18.3972,155.839,3.12533,41.9673 +1024,32768,23.6427,221.682,4.9535,63.0426 +1024,65536,34.0302,386.226,8.7269,105.694 +1024,131072,55.8035,699.614,15.1361,193.193 +1024,262144,97.2882,1268.49,27.3374,377.109 +1024,524288,179.472,2443.64,52.7626,733.811 +1024,1048576,344.118,4774.14,104.366,1465.54 diff --git a/investigations/flatLongTest/cudaUtils.h b/investigations/flatLongTest/cudaUtils.h new file mode 100644 index 0000000..26f940b --- /dev/null +++ b/investigations/flatLongTest/cudaUtils.h @@ -0,0 +1,175 @@ +/* + UTILITIES FOR HSWEEP. TIMER and DRIVER AGREEMENT CHECKER. +*/ + +#ifndef SWEEPUTIL_H +#define SWEEPUTIL_H +#include +#include +#include + +#include +#include + +#include +#include +#include + +// template< typename T > +// void check(T result, char const *const func, const char *const file, int const line) +// { +// if (result) +// { +// fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", +// file, line, static_cast(result), _cudaGetErrorEnum(result), func); +// DEVICE_RESET +// // Make sure we call CUDA Device Reset before exiting +// exit(EXIT_FAILURE); +// } +// } + +// #define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +void cudaKernelCheck(bool gpuq=true) +{ + if (gpuq) + { + cudaError_t error = cudaGetLastError(); + if(error != cudaSuccess) + { + // print the CUDA error message and exit + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaDeviceSynchronize(); + } +} + +// From cppOverload in CUDASamples. +#define OUTPUT_ATTR(attr) \ + printf("Shared Size: %d\n", (int)attr.sharedSizeBytes); \ + printf("Constant Size: %d\n", (int)attr.constSizeBytes); \ + printf("Local Size: %d\n", (int)attr.localSizeBytes); \ + printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock); \ + printf("Number of Registers: %d\n", attr.numRegs); \ + printf("PTX Version: %d\n", attr.ptxVersion); \ + printf("Binary Version: %d\n", attr.binaryVersion); \ + +void cudaRunCheck() +{ + int rv, dv; + cudaDriverGetVersion(&dv); + cudaRuntimeGetVersion(&rv); + printf("CUDA Driver Version / Runtime Version --- %d.%d / %d.%d\n", dv/1000, (dv%100)/10, rv/1000, (rv%100)/10); +} + +void factor(int n, int* factors) +{ + int sq = std::sqrt(n); + int outf = n/sq; + while (outf * sq != n) + { + sq--; + outf=n/sq; + } + factors[0] = sq; + factors[1] = outf; +} + +struct cudaTime +{ + std::vector times; + cudaEvent_t start, stop; + int nTimes=0; + float ti; + std::string typ = "GPU"; + + cudaTime() { + cudaEventCreate( &start ); + cudaEventCreate( &stop ); + }; + ~cudaTime() + { + cudaEventDestroy( start ); + cudaEventDestroy( stop ); + }; + + void tinit(){ cudaEventRecord( start, 0); }; + + void tfinal(double nTimesteps=1.0) { + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime( &ti, start, stop); + ti *= 1.0e3; + nTimes++; + times.push_back(ti/nTimesteps); + }; + + float getLastTime() + { + return times.back(); + }; + + float avgt() { + double sumt = 0.0; + for (auto itime: times) sumt += itime; + return sumt/(double)times.size(); + }; + + friend std::ostream& operator<< (std::ostream &out, const cudaTime &ct); +}; + + +std::ostream& operator<< (std::ostream &out, const cudaTime &ct) +{ + out << "Simulation Time (us) - "; + for (auto itt: ct.times) + out << itt << " | "; + + return out; +} + +#ifdef MPIS +#include + +struct mpiTime +{ + std::vector times; + double ti; + + std::string typ = "CPU"; + + void tinit(){ ti = MPI_Wtime(); } + + void tfinal() { times.push_back((MPI_Wtime()-ti)*1.e6); } + + int avgt() { + double sumt = 0.0; + for (auto itime: times) sumt += itime; + return sumt/(double)times.size(); + }; +}; + +#endif +// void atomicWrite(std::string st, std::vector t) +// { +// FILE *tTemp; +// MPI_Barrier(MPI_COMM_WORLD); + +// for (int k=0; k. +*/ + +//COMPILE LINE! +// nvcc -o ./bin/KSOut KS1D_SweptShared.cu -gencode arch=compute_35,code=sm_35 -lm -restrict -Xcompiler -fopenmp --ptxas-options=-v + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef GPUNUM + #define GPUNUM 0 +#endif + +#define QUARTER 0.25 +#define HALF 0.5 +#define ONE 1.0 +#define TWO 2.0 +#define FOUR 4.0 +#define SIX 6.0 + +#define NSTEPS 4 // How many substeps in timestep. + +// Since anyone would need to write a header and functions file, why not just hardwire this. +// If the user's number of steps isn't a power of 2 use the other one. + +#define MODULA(x) x & (NSTEPS-1) +// #define MODULA(x) x % NSTEPS + +#define DIVMOD(x) (MODULA(x)) >> 1 + +const double dx = 0.5; + +struct discConstants{ + + double dxTimes4; + double dx2; + double dx4; + double dt; + double dt_half; + int base; + int ht; + int idxend; +}; + +__constant__ discConstants disc; + +//Initial condition. +__host__ +double initFun(double xnode) +{ + return TWO * cos(19.0*xnode*M_PI/128.0); +} + +__device__ __forceinline__ +double fourthDer(double tfarLeft, double tLeft, double tCenter, double tRight, double tfarRight) +{ + return (tfarLeft - FOUR*tLeft + SIX*tCenter - FOUR*tRight + tfarRight)*(disc.dx4); +} + +__device__ __forceinline__ +double secondDer(double tLeft, double tRight, double tCenter) +{ + return (tLeft + tRight - TWO*tCenter)*(disc.dx2); +} + +__device__ __forceinline__ +double convect(double tLeft, double tRight) +{ + return (tRight*tRight - tLeft*tLeft)*(disc.dxTimes4); +} + +struct runVar{ + int bks, tpb, dv; + double dt, tf, lx; + size_t smem; + + void printme() + { + cout << "KS ---" << endl; + cout << "#Blocks: " << bks << " | Length: " << lx << " | dt/dx: " << dt/dx << endl; + cout << "TPB: " << tpb << " Grid Size " << dv << endl; + } +} rv; + +#include "KSAtomic.hpp" + +int main( int argc, char *argv[]) +{ + + cudaSetDevice(GPUNUM); + // if cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); + + rv.dv = atoi(argv[1]); //Number of spatial points + rv.tpb = atoi(argv[2]); //Threads per Block + rv.dt = atof(argv[3]); //delta T timestep + rv.tf = atof(argv[4]) - 0.25*rv.dt; //Finish time + rv.bks = rv.dv/rv.tpb; //The number of blocks + rv.lx = rv.dv*dx; + rv.smem = (2*rv.tpb+8)*szSt; + + rv.printme(); + + discConstants dsc = { + ONE/(FOUR*dx), //dx + ONE/(dx*dx), //dx^2 + ONE/(dx*dx*dx*dx), //dx^4 + rv.dt, //dt + rv.dt*0.5, //dt half + rv.tpb + 4, //length of row of shared array + (rv.tpb+4)/2, //midpoint of shared array row + rv.dv-1 //last global thread id + }; + + cudaMemcpyToSymbol(disc,&dsc,sizeof(dsc)); + + double *arrayState; + states *atomicState; + + // Initialize, time, write out. Compare Array to Atomic. + // YOU HAVE 1 HOUR. + + cudaFreeHost(arrayState); + cudaFreeHost(atomicState); + + return 0; + +} \ No newline at end of file diff --git a/investigations/flatLongTest/rslts/atomicArray.csv b/investigations/flatLongTest/rslts/atomicArray.csv new file mode 100644 index 0000000..38325e4 --- /dev/null +++ b/investigations/flatLongTest/rslts/atomicArray.csv @@ -0,0 +1,61 @@ +tpb,dv,sweptArray,sweptAtomic,classicArray,classicAtomic +32,2048,2.40007,6.63434,10.4235,29.6741 +32,4096,2.6025,7.51959,12.404,32.9315 +32,8192,3.67676,12.7995,13.6601,47.2752 +32,16384,5.08457,19.3627,16.5338,66.6656 +32,32768,8.15502,33.3647,23.7059,110.6 +32,65536,14.7692,62.9635,37.4083,210.004 +32,131072,27.6137,119.673,71.1649,384.872 +32,262144,53.1766,232.851,129.996,737.279 +32,524288,104.275,459.421,247.739,1443.36 +32,1048576,207.71,912.714,483.13,2847.12 +64,2048,1.88515,5.63549,10.5808,29.8034 +64,4096,2.0648,6.41555,12.2258,35.0574 +64,8192,2.48611,9.1848,12.1253,44.8238 +64,16384,3.5951,24.7273,15.6067,65.5351 +64,32768,5.7921,46.2661,20.1162,100.891 +64,65536,10.2214,86.9771,29.9518,179.258 +64,131072,19.0078,171.308,52.3312,330.095 +64,262144,36.4424,339.954,92.4432,629.662 +64,524288,71.3321,677.602,172.641,1228.7 +64,1048576,141.045,1352.59,333.048,2426.57 +128,2048,1.53293,5.15251,10.1668,30.46 +128,4096,1.6771,6.14865,12.2177,37.4525 +128,8192,2.03253,8.64294,12.336,45.7973 +128,16384,2.8478,18.0308,15.5637,64.6168 +128,32768,4.58072,44.6867,20.4183,104.386 +128,65536,7.99725,83.013,29.927,182.722 +128,131072,14.8442,161.414,50.0962,335.065 +128,262144,28.267,321.015,88.2068,637.475 +128,524288,55.1163,638.64,164.32,1242.33 +128,1048576,108.843,1268.33,316.867,2452.8 +256,2048,1.3505,4.95406,11.3153,30.8443 +256,4096,1.57635,7.46392,12.4013,40.9095 +256,8192,1.92905,9.21015,13.8858,49.8651 +256,16384,2.6322,16.0087,15.3831,68.2373 +256,32768,4.22022,41.0684,20.4245,104.76 +256,65536,7.40611,68.8514,30.3287,186.47 +256,131072,13.5593,128.168,50.4761,339.494 +256,262144,25.8775,254.894,88.6498,642.157 +256,524288,50.4286,508.592,165.387,1247.21 +256,1048576,99.5966,1017.17,318.505,2459.41 +512,2048,1.52177,7.71707,12.0663,40.6267 +512,4096,1.52928,7.7432,12.3056,41.3265 +512,8192,2.00342,11.3485,13.6711,58.1932 +512,16384,2.51893,16.2842,17.0839,78.3168 +512,32768,4.5901,34.3451,21.7042,115.247 +512,65536,8.04788,56.7913,30.7422,195.298 +512,131072,14.6278,112.054,51.616,358.37 +512,262144,26.6582,218.602,89.9169,675.46 +512,524288,51.6933,419.75,167.19,1305.18 +512,1048576,101.67,826.456,321.632,2562.83 +1024,2048,1.97449,11.864,14.3608,62.6596 +1024,4096,1.98058,11.9167,14.5522,62.908 +1024,8192,1.99942,11.9681,15.0342,63.3168 +1024,16384,3.04024,24.1762,18.346,99.7006 +1024,32768,4.95251,36.956,23.664,138.721 +1024,65536,8.64458,62.194,33.9697,227.136 +1024,131072,14.7341,113.775,55.9044,395.561 +1024,262144,27.0346,215.657,97.4133,738.591 +1024,524288,52.0674,419.875,179.548,1418.75 +1024,1048576,103.674,828.409,344.129,2786.37 diff --git a/investigations/flatLongTest/rslts/atomicPadding.csv b/investigations/flatLongTest/rslts/atomicPadding.csv new file mode 100644 index 0000000..aac4add --- /dev/null +++ b/investigations/flatLongTest/rslts/atomicPadding.csv @@ -0,0 +1,61 @@ +tpb,dv,classicArray,classicAtomic,sweptArray,sweptAtomic +32,2048,10.5145,37.2119,2.35491,10.6148 +32,4096,12.3915,45.3359,2.58066,11.9703 +32,8192,13.5635,67.7075,3.66284,20.8072 +32,16384,16.8177,104.992,5.08569,32.5191 +32,32768,23.6894,182.908,8.12744,56.7365 +32,65536,37.5551,337.811,14.7585,104.285 +32,131072,71.0992,624.913,27.6128,201.835 +32,262144,130.013,1212.95,53.1499,396.345 +32,524288,247.638,2379.28,104.263,784.76 +32,1048576,482.919,4718.92,206.56,1562.93 +64,2048,10.4119,37.267,1.85109,9.61128 +64,4096,12.1452,45.4394,2.0203,10.7847 +64,8192,12.104,62.9761,2.472,14.5321 +64,16384,15.5412,102.015,3.56851,43.8224 +64,32768,20.223,172.222,5.78267,80.5145 +64,65536,29.8674,316.25,10.2022,153.761 +64,131072,52.4047,588.891,19.0093,305.169 +64,262144,92.2957,1136.43,36.4602,610.572 +64,524288,172.54,2230.38,71.3658,1221.9 +64,1048576,332.946,4418.75,141.111,2443.84 +128,2048,10.2113,41.1027,1.52081,10.6074 +128,4096,12.2082,49.9857,1.66086,12.5047 +128,8192,12.3124,65.5395,2.02396,14.9668 +128,16384,15.6147,99.2663,2.84368,29.5843 +128,32768,20.4046,175.152,4.57271,61.0777 +128,65536,29.8627,326.316,8.01774,118.123 +128,131072,49.9871,607.73,14.8674,226.007 +128,262144,88.2125,1159.59,28.2828,445.944 +128,524288,164.271,2260.31,55.136,892.943 +128,1048576,316.65,4457.8,108.906,1784.42 +256,2048,11.3281,41.7556,1.34541,11.2674 +256,4096,12.3938,56.648,1.59529,14.5261 +256,8192,13.7884,72.7568,1.91884,17.2108 +256,16384,15.447,106.5,2.6278,26.6545 +256,32768,20.4602,178.801,4.21093,49.3025 +256,65536,30.1686,329.489,7.3595,98.4544 +256,131072,50.2862,606.564,13.6116,180.673 +256,262144,88.6015,1159.79,25.8546,348.425 +256,524288,165.201,2259.47,50.4757,682.442 +256,1048576,318.283,4456.44,99.6421,1355.42 +512,2048,12.1479,57.5493,1.51909,15.2436 +512,4096,12.1921,57.9564,1.52334,15.3118 +512,8192,13.5092,87.884,2.00571,20.6509 +512,16384,17.1218,121.834,2.60008,36.8136 +512,32768,21.7258,187.141,4.68618,61.4165 +512,65536,30.7689,342.344,7.92118,103.953 +512,131072,51.5981,625.877,14.7362,189.635 +512,262144,89.9209,1175.56,26.6776,364.381 +512,524288,167.145,2275.55,52.309,710.869 +512,1048576,321.477,4482.63,102.133,1410.78 +1024,2048,14.3089,88.6339,1.96935,21.3711 +1024,4096,14.6172,89.0899,1.97433,21.4486 +1024,8192,15.0064,90.181,1.98356,21.5563 +1024,16384,18.3972,155.839,3.12533,41.9673 +1024,32768,23.6427,221.682,4.9535,63.0426 +1024,65536,34.0302,386.226,8.7269,105.694 +1024,131072,55.8035,699.614,15.1361,193.193 +1024,262144,97.2882,1268.49,27.3374,377.109 +1024,524288,179.472,2443.64,52.7626,733.811 +1024,1048576,344.118,4774.14,104.366,1465.54 diff --git a/investigations/flatLongTest/runatom.py b/investigations/flatLongTest/runatom.py new file mode 100644 index 0000000..5eda301 --- /dev/null +++ b/investigations/flatLongTest/runatom.py @@ -0,0 +1,98 @@ + +import subprocess as sp +import shlex +import os +import os.path as op +import sys +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import time + +thispath = op.abspath(op.dirname(__file__)) +impath = op.join(thispath, "images") + +ext = ".pdf" + +plt.style.use("swept.mplstyle") +def savePlot(fh, name): + plotfile = op.join(impath, name + ext) + fh.savefig(plotfile, dpi=200, bbox_inches="tight") + + +def plotdf(df, subp, name, kwarg={}): + icols = df.columns.values + dframe = {} + fx, ax = plt.subplots(*subp, figsize=(10,8)) + axx = ax.ravel() + ccols = list(icols[:2]) + for ix, ic in enumerate(icols[2:]): + cidx = ccols+[ic] + ibo = df[cidx].pivot(*cidx).T + dframe[ic] = ibo + ibo.plot(ax=axx[ix], title=ic, **kwarg) + + fx.subplots_adjust(bottom=0.08, right=0.85, top=0.9, wspace=0.15, hspace=0.3) + fx.tight_layout() + savePlot(fx, name) + return dframe + +def minkey(df, ky): + dfo = df.groupby(ky).min(axis=1).drop('tpb', axis=1) + return dfo + + +def quickPlot(f): + idf = pd.read_csv(f, header=0) + idf.rename({"dv": "gridSize"}, axis=1) + icols = idf.columns.values + name = f.split(".")[0] + plotdf(idf, [2,2], name+"Raw", kwarg={"loglog": True}) + speedup = idf[icols[:2]] + speedup[icols[2][:5]] = idf[icols[3]] / idf[icols[2]] + speedup[icols[4][:5]] = idf[icols[5]] / idf[icols[4]] + # speedup.columns + plotdf(speedup, [1, 2], name+"Speedup", kwarg={"logx": True}) + + return idf, speedup + +def getPlotBest(f): + idf = pd.read_csv(f, header=0) + minz = idf.groupby('dv').min().drop('tpb', axis=1) + mcol = minz.columns.values + speedz = pd.DataFrame() + print(minz) + speedz["Swept"] = minz[mcol[1]]/minz[mcol[0]] + speedz["Classic"] = minz[mcol[3]]/minz[mcol[2]] + ax = speedz.plot(logx=True) + ax.set_ylabel("Speedup") + ax.set_xlabel("Number of Spatial Points") + savePlot(ax.figure, "Speedup" + f.split(".")[0][-6:]) + + +if __name__ == "__main__": + + if len(sys.argv) > 1: + print("Just Reloading") + sys.exit(-1) + + tpb = [2**k for k in range(5,11)] + nx = [2**k for k in range(11,21)] + + ex = "./bin/KSmain " + + times = " 0.001 10" + + for t in tpb: + for n in nx: + strnia = ex + "{:d} {:d}".format(n, t) + times + + exstr = shlex.split(strnia) + proc = sp.Popen(exstr) + sp.Popen.wait(proc) + time.sleep(3) + + ss = "atomicArray.csv" + # framed = quickPlot(ss) + # plt.show() + diff --git a/investigations/flatLongTest/swept.mplstyle b/investigations/flatLongTest/swept.mplstyle new file mode 100644 index 0000000..fd15355 --- /dev/null +++ b/investigations/flatLongTest/swept.mplstyle @@ -0,0 +1,17 @@ + +lines.linewidth: 3 +lines.markersize: 8 +figure.figsize: 6, 4 +figure.titlesize: xx-large +figure.titleweight: bold +axes.titlesize: x-large +axes.labelsize: large +xtick.major.size: 6 +xtick.labelsize: large +ytick.labelsize: large +ytick.major.size: 6 +grid.alpha: 0.5 +axes.grid: True +savefig.dpi: 1000 +savefig.bbox: tight +font.family: serif diff --git a/runtools/timing_analysis.py b/runtools/timing_analysis.py index dd64b00..a9a06a0 100644 --- a/runtools/timing_analysis.py +++ b/runtools/timing_analysis.py @@ -153,6 +153,7 @@ def getFitRs(dfp): sp, inter, r2, p, std_err = scipy.stats.linregress(np.array(dfa.index), dfa[s]) inter = np.exp(inter) spcol[s] = [sp, inter, r2] + spcol.index = ["A", "b", "r2"] return spcol