diff --git a/investigations/flatLongTest/EulerArray.hpp b/investigations/flatLongTest/EulerArray.hpp
new file mode 100644
index 0000000..e69de29
diff --git a/investigations/flatLongTest/EulerAtomic.hpp b/investigations/flatLongTest/EulerAtomic.hpp
new file mode 100644
index 0000000..e69de29
diff --git a/investigations/flatLongTest/KSArray.hpp b/investigations/flatLongTest/KSArray.hpp
new file mode 100644
index 0000000..7b5c2b5
--- /dev/null
+++ b/investigations/flatLongTest/KSArray.hpp
@@ -0,0 +1,342 @@
+
+
+
+//Read in the data from the global right/left variables to the shared temper variable.
+__device__ __forceinline__
+void readIn(double *temp, const double *rights, const double *lefts, int td, int gd)
+{
+	int leftidx = disc.ht + (((td>>2) & 1) * disc.base) + (td & 3) - (4 + ((td>>2)<<1));
+	int rightidx = disc.ht + (((td>>2) & 1) * disc.base) + ((td>>2)<<1) + (td & 3);
+
+	temp[leftidx] = rights[gd];
+	temp[rightidx] = lefts[gd];
+}
+
+__device__ __forceinline__
+void writeOutRight(double *temp, double *rights, double *lefts, int td, int gd, int bd)
+{
+	int gdskew = (gd + bd) & disc.idxend;
+    int leftidx = (((td>>2) & 1)  * disc.base) + ((td>>2)<<1) + (td & 3) + 2;
+    int rightidx = (disc.base - 6) + (((td>>2) & 1)  * disc.base) + (td & 3) - ((td>>2)<<1);
+	rights[gdskew] = temp[rightidx];
+	lefts[gd] = temp[leftidx];
+}
+
+__device__ __forceinline__
+void writeOutLeft(double *temp, double *rights, double *lefts, int td, int gd, int bd)
+{
+	int gdskew = (gd - bd) & disc.idxend;
+    int leftidx = (((td>>2) & 1)  * disc.base) + ((td>>2)<<1) + (td & 3) + 2;
+    int rightidx = (disc.base-6) + (((td>>2) & 1)  * disc.base) + (td & 3) - ((td>>2)<<1);
+	rights[gd] = temp[rightidx];
+	lefts[gdskew] = temp[leftidx];
+}
+
+__device__ __forceinline__
+double stutterStep(const double *u, const int loc[5])
+{
+	return u[loc[2]] - disc.dt_half * (convect(u[loc[1]], u[loc[3]]) + secondDer(u[loc[1]], u[loc[3]], u[loc[2]]) +
+		fourthDer(u[loc[0]], u[loc[1]], u[loc[2]], u[loc[3]], u[loc[4]]));
+}
+
+__device__
+double finalStep(const double *u, const int loc[5])
+{
+	return (-disc.dt * (convect(u[loc[1]], u[loc[3]]) + secondDer(u[loc[1]], u[loc[3]], u[loc[2]]) +
+		fourthDer(u[loc[0]], u[loc[1]], u[loc[2]], u[loc[3]], u[loc[4]])));
+}
+
+//Classic
+template <bool FINAL>
+__global__ void
+classicKS(const double *ks_in, double *ks_out)
+{
+	auto gidz = idxes<5>();
+	
+	if (FINAL)	 ks_out[gidz.a[2]] +=  finalStep(ks_in, gidz.a);
+	else 		 ks_out[gidz.a[2]]  =  stutterStep(ks_in, gidz.a);
+}
+
+__global__
+void
+upTriangle(const double *IC, double *outRight, double *outLeft)
+{
+	extern __shared__ double temper[];
+
+	int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID
+	int tididx = threadIdx.x + 2;
+
+	int step2;
+
+	int tid_top[5], tid_bottom[5];
+	#pragma unroll
+	for (int k = -2; k<3; k++)
+	{
+		tid_top[k+2] = tididx + k + disc.base;
+		tid_bottom[k+2] = tididx + k;
+	}
+
+    //Assign the initial values to the first row in temper, each block
+    //has it's own version of temper shared among its threads.
+	temper[tididx] = IC[gid];
+
+	__syncthreads();
+
+	if (threadIdx.x > 1 && threadIdx.x <(blockDim.x-2))
+	{
+		temper[tid_top[2]] = stutterStep(temper, tid_bottom);
+	}
+
+	__syncthreads();
+
+	for (int k = 4; k<(blockDim.x/2); k+=4)
+	{
+		if (threadIdx.x < (blockDim.x-k) && threadIdx.x >= k)
+		{
+			temper[tididx] += finalStep(temper, tid_top);
+		}
+
+		step2 = k + 2;
+		__syncthreads();
+
+		if (threadIdx.x < (blockDim.x-step2) && threadIdx.x >= step2)
+		{
+			temper[tid_top[2]] = stutterStep(temper, tid_bottom);
+		}
+
+		//Make sure the threads are synced
+		__syncthreads();
+
+	}
+
+	writeOutRight(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x);
+
+}
+
+__global__
+void
+downTriangle(double *IC, const double *inRight, const double *inLeft)
+{
+	extern __shared__ double temper[];
+
+	int gid = blockDim.x * blockIdx.x + threadIdx.x;
+	int tididx = threadIdx.x+2;
+
+	int step2;
+
+	int tid_top[5], tid_bottom[5];
+	#pragma unroll
+	for (int k = -2; k<3; k++)
+	{
+		tid_top[k+2] = tididx + k + disc.base;
+		tid_bottom[k+2] = tididx + k;
+	}
+
+	readIn(temper, inRight, inLeft, threadIdx.x, gid);
+
+	__syncthreads();
+
+	for (int k = (disc.ht-2); k>0; k-=4)
+	{
+		if (tididx < (disc.base-k) && tididx >= k)
+		{
+			temper[tid_top[2]] = stutterStep(temper, tid_bottom);
+		}
+
+		step2 = k-2;
+		__syncthreads();
+
+		if (tididx < (disc.base-step2) && tididx >= step2)
+		{
+			temper[tididx] += finalStep(temper, tid_top);
+		}
+
+		//Make sure the threads are synced
+		__syncthreads();
+	}
+
+    IC[gid] = temper[tididx];
+}
+
+template <bool SPLIT>
+__global__ void
+wholeDiamond(double *inRight, double *inLeft, double *outRight, double *outLeft)
+{
+	extern __shared__ double temper[];
+
+	int gid = blockDim.x * blockIdx.x + threadIdx.x;
+	int tididx = threadIdx.x + 2;
+
+	int step2;
+
+	int tid_top[5], tid_bottom[5];
+	#pragma unroll
+	for (int k = -2; k<3; k++)
+	{
+		tid_top[k+2] = tididx + k + disc.base;
+		tid_bottom[k+2] = tididx + k;
+	}
+
+	readIn(temper, inRight, inLeft, threadIdx.x, gid);
+
+	__syncthreads();
+
+	for (int k = (disc.ht-2); k>0; k-=4)
+	{
+		if (tididx < (disc.base-k) && tididx >= k)
+		{
+			temper[tid_top[2]] = stutterStep(temper, tid_bottom);
+		}
+
+		step2 = k-2;
+		__syncthreads();
+
+		if (tididx < (disc.base-step2) && tididx >= step2)
+		{
+			temper[tididx] += finalStep(temper, tid_top);
+		}
+
+		//Make sure the threads are synced
+		__syncthreads();
+	}
+
+    //-------------------TOP PART------------------------------------------
+
+	if (threadIdx.x > 1 && threadIdx.x <(blockDim.x-2))
+	{
+		temper[tid_top[2]] = stutterStep(temper, tid_bottom);
+	}
+
+	__syncthreads();
+
+	//The initial conditions are timslice 0 so start k at 1.
+	for (int k = 4; k<(blockDim.x/2); k+=4)
+	{
+		if (threadIdx.x < (blockDim.x-k) && threadIdx.x >= k)
+		{
+			temper[tididx] += finalStep(temper, tid_top);
+		}
+
+		step2 = k+2;
+		__syncthreads();
+
+		if (threadIdx.x < (blockDim.x-step2) && threadIdx.x >= step2)
+		{
+			temper[tid_top[2]] = stutterStep(temper, tid_bottom);
+		}
+
+		//Make sure the threads are synced
+		__syncthreads();
+
+	}
+
+	//After the triangle has been computed, the right and left shared arrays are
+	//stored in global memory by the global thread ID since (conveniently),
+	//they're the same size as a warp!
+	if (SPLIT)
+	{
+		writeOutLeft(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x);
+	}
+	else
+	{
+		writeOutRight(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x);
+	}
+}
+
+double
+classicWrapper(double *aState)
+{
+	std::cout << " -- Classic Array -- " << std::endl;
+
+	const int aSize = sizeof(double) * rv.dv;
+	int tstep = 0;
+	// I'm just going to malloc in here.
+
+    double *dks_in, *dks_out;
+
+    cudaMalloc((void **)&dks_in,  aSize);
+    cudaMalloc((void **)&dks_out, aSize);
+
+    // Copy the initial conditions to the device array.
+    cudaMemcpy(dks_in, aState, aSize, cudaMemcpyHostToDevice);
+    double t_eq = 0.0;
+
+    while (t_eq <= rv.tf)
+    {
+        classicKS <false> <<< rv.bks, rv.tpb >>> (dks_in, dks_out);
+        classicKS <true>  <<< rv.bks, rv.tpb >>> (dks_out, dks_in);
+		tstep += 4;
+        t_eq += rv.dt;
+		
+    }
+
+    cudaMemcpy(aState, dks_in, aSize, cudaMemcpyDeviceToHost);
+	std::cout << tstep << std::endl;
+
+    cudaFree(dks_in);
+    cudaFree(dks_out);
+
+    return t_eq;
+}
+
+
+//The host routine.
+double
+sweptWrapper(double *aState)
+{
+	std::cout << " -- Swept Array -- " << std::endl;
+	
+	const size_t szSt = sizeof(double); 
+    const size_t smem = (2*rv.tpb+8)*szSt;
+    
+    std::cout << "SHARED MEMORY: " << smem << std::endl;
+
+	double *d_IC, *d0_right, *d0_left, *d2_right, *d2_left;
+	const int aSize = sizeof(double) * rv.dv;
+
+	cudaMalloc((void **)&d_IC, 		aSize);
+	cudaMalloc((void **)&d0_right, 	aSize);
+	cudaMalloc((void **)&d0_left, 	aSize);
+	cudaMalloc((void **)&d2_right, 	aSize);
+	cudaMalloc((void **)&d2_left, 	aSize);
+
+	// Copy the initial conditions to the device array.
+	cudaMemcpy(d_IC, aState, aSize, cudaMemcpyHostToDevice);
+	//Start the counter and start the clock.
+
+	//Every other step is a full timestep and each cycle is half tpb steps.
+	double t_fullstep = 0.25 * rv.dt * (double)rv.tpb;
+	double t_eq;
+
+	t_eq = t_fullstep;
+
+	upTriangle <<< rv.bks, rv.tpb, smem >>> (d_IC, d0_right, d0_left);
+
+	//Split
+	wholeDiamond <true> <<< rv.bks, rv.tpb, smem >>> (d0_right, d0_left, d2_right, d2_left);
+
+	// Call the kernels until you reach the iteration limit.
+	while(t_eq < rv.tf)
+	{
+		wholeDiamond <false> <<< rv.bks, rv.tpb, smem >>> (d2_right, d2_left, d0_right, d0_left);
+
+		//Split
+		wholeDiamond <true> <<< rv.bks, rv.tpb, smem >>> (d0_right, d0_left, d2_right, d2_left);
+
+		t_eq += t_fullstep;
+	}
+
+	downTriangle <<< rv.bks, rv.tpb, smem >>> (d_IC, d2_right, d2_left);
+
+	cudaMemcpy(aState, d_IC, aSize, cudaMemcpyDeviceToHost);
+
+	cudaFree(d_IC);
+	cudaFree(d0_right);
+	cudaFree(d0_left);
+	cudaFree(d2_right);
+	cudaFree(d2_left);
+	std::cout << t_eq/rv.dt << std::endl;
+
+	return t_eq;
+}
+
diff --git a/investigations/flatLongTest/KSAtomic.hpp b/investigations/flatLongTest/KSAtomic.hpp
new file mode 100644
index 0000000..825f021
--- /dev/null
+++ b/investigations/flatLongTest/KSAtomic.hpp
@@ -0,0 +1,232 @@
+
+
+
+struct states {
+   double u[2]; // Full Step, Midpoint step state variables
+   double uxx;
+};
+
+__device__ __forceinline__
+double sumu(states ust, const int ix) { return ust.u[ix] + ust.uxx; }
+
+
+__device__
+void ksStep(states *state, const int idx[3], const int tstep)
+{
+    const int itx = (tstep ^ 1);
+    state[idx[1]].u[tstep] = state[idx[1]].u[0] - (HALF * disc.dt * (itx+1)) *
+                        (convect(state[idx[0]].u[itx], state[idx[2]].u[itx]) +
+                        secondDer(sumu(state[idx[0]], itx), sumu(state[idx[2]], itx), sumu(state[idx[1]], itx)));
+}
+
+__device__
+void stepUpdate(states *state, const int idx[3], const int tstep)
+{
+    const int ts = DIVMOD(tstep);
+    if (tstep & 1) //Odd - Rslt is 0 for even numbers
+    {
+        state[idx[1]].uxx = secondDer(state[idx[0]].u[ts],
+                                    state[idx[2]].u[ts],
+                                    state[idx[1]].u[ts]);
+    }
+    else
+    {
+        ksStep(state, idx, ts);
+    }
+}
+
+
+__global__ void classicStep(states *state, const int ts)
+{
+	auto gidz = idxes<3>();
+
+    stepUpdate(state, gidz.a, ts);
+}
+
+__global__ void upTriangle(states *state, const int tstep)
+{
+	extern __shared__ states sharedState[];
+
+	int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID
+    int mid = blockDim.x >> 1;
+    const int tidx[3] = {threadIdx.x-1, threadIdx.x, threadIdx.x+1};
+
+    // Using tidx as tid is kind of confusing for reader but looks valid.
+
+	sharedState[tidx[1]] = state[gid];
+
+    __syncthreads();
+
+	for (int k=1; k<mid; k++)
+	{
+		if (tidx[1] < (blockDim.x-k) && tidx[1] >= k)
+		{
+            stepUpdate(sharedState, tidx, tstep + k);
+		}
+		__syncthreads();
+	}
+    state[gid] = sharedState[tidx[1]];
+}
+
+
+__global__
+void
+downTriangle(states *state, const int tstep)
+{
+	extern __shared__ states sharedState[];
+
+    const int tid = threadIdx.x; // Thread index
+    const int mid = blockDim.x >> 1; // Half of block size
+    const int base = blockDim.x + 2;
+    const int gid = blockDim.x * blockIdx.x + tid;
+    const int tidx[3] = {tid, tid + 1, tid + 2};
+
+    int tnow = tstep; // read tstep into register.
+
+    if (tid<2) sharedState[tid] = state[disc.modg(gid-1)];
+    sharedState[tid+2] = state[disc.modg(gid+1)];
+    __syncthreads();
+
+	for (int k=mid; k>0; k--)
+	{
+		if (tidx[1] < (base-k) && tidx[1] >= k)
+		{
+	    	stepUpdate(sharedState, tidx, tnow);
+		}
+		tnow++;
+		__syncthreads();
+	}
+    state[gid] = sharedState[tidx[1]];
+}
+
+template <bool SPLIT>
+__global__ void
+wholeDiamond(states *state, const int tstep)
+{
+	extern __shared__ states sharedState[];
+
+    const int mid = (blockDim.x >> 1); // Half of block size
+    const int base = blockDim.x + 2;
+    const int gid = blockDim.x * blockIdx.x + threadIdx.x + SPLIT*(disc.ht-2);
+    const int tidx[3] = {threadIdx.x, threadIdx.x + 1, threadIdx.x + 2};
+
+    int tnow = tstep;
+	int k;
+
+    if (threadIdx.x<2) sharedState[threadIdx.x] = state[disc.modg(gid-1)];
+    sharedState[tidx[2]] = state[disc.modg(gid+1)];
+
+    __syncthreads();
+
+	for (k=mid; k>0; k--)
+	{
+		if (tidx[1] < (base-k) && tidx[1] >= k)
+		{
+        	stepUpdate(sharedState, tidx, tnow);
+		}
+		tnow++;
+		__syncthreads();
+	}
+
+	for (k=2; k<=mid; k++)
+	{
+		if (tidx[1] < (base-k) && tidx[1] >= k)
+		{
+            stepUpdate(sharedState, tidx, tnow);
+		}
+		tnow++;
+		__syncthreads();
+    }
+
+    state[disc.modg(gid)] = sharedState[tidx[1]];
+}
+
+
+
+double classicWrapper(states *aState)
+{
+    std::cout << " -- Classic Atom -- " << std::endl;
+    const size_t szSt = sizeof(states); 
+    int tstep = 1;
+    const int npass = rv.dv * szSt;
+
+    states *dState;
+
+    cudaMalloc((void **) &dState, npass);
+
+    // Copy the initial conditions to the device array.
+    cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice);
+
+    double t_eq = 0.0;
+
+    while (t_eq <= rv.tf)
+    {
+        classicStep <<< rv.bks, rv.tpb >>> (dState, tstep);
+        tstep++;
+        if (tstep%NSTEPS == 0) t_eq += rv.dt;
+    }
+
+    cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost);
+    std::cout << tstep << std::endl;
+    cudaFree(dState);
+
+    return t_eq;
+}
+
+//The host routine.
+double
+sweptWrapper(states *aState)
+{
+    std::cout << " -- Swept Atom -- " << std::endl;
+    const size_t szSt = sizeof(states); 
+    const size_t smem = (rv.tpb+2)*szSt;
+    
+    std::cout << "SHARED MEMORY: " << smem << std::endl;
+    int tstep = 1;
+	states *dState;
+    const int npass = rv.dv * szSt;
+
+	cudaMalloc((void **)&dState, npass);
+
+	// Copy the initial conditions to the device array.
+	cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice);
+	//Start the counter and start the clock.
+
+	//Every other step is a full timestep and each cycle is half tpb steps.
+    
+    int stepi = rv.tpb/2;
+	double t_fullstep = 0.25 * rv.dt * (double)rv.tpb;
+	double t_eq;
+
+	t_eq = t_fullstep;
+
+	upTriangle <<< rv.bks, rv.tpb, smem >>> (dState, tstep);
+
+	//Split
+	wholeDiamond <true> <<< rv.bks, rv.tpb, smem >>> (dState, tstep);
+
+    tstep += stepi;
+
+	// Call the kernels until you reach the iteration limit.
+	while(t_eq < rv.tf)
+	{
+		wholeDiamond <false> <<< rv.bks, rv.tpb, smem >>> (dState, tstep);
+
+        tstep += stepi;
+		
+		wholeDiamond <true> <<< rv.bks, rv.tpb, smem >>> (dState, tstep);
+
+		t_eq  += t_fullstep;
+        tstep += stepi;
+	}
+
+	downTriangle <<< rv.bks, rv.tpb, smem >>>(dState, tstep);
+
+    tstep += stepi;
+
+	cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost);
+
+	cudaFree(dState);
+    std::cout << tstep-1 << " -- " << t_eq/rv.dt << std::endl;
+	return t_eq;
+}
diff --git a/investigations/flatLongTest/KSAtomicPad.hpp b/investigations/flatLongTest/KSAtomicPad.hpp
new file mode 100644
index 0000000..e32b5be
--- /dev/null
+++ b/investigations/flatLongTest/KSAtomicPad.hpp
@@ -0,0 +1,255 @@
+
+#include <algorithm>
+
+struct states {
+   double u[2]; // Full Step, Midpoint step state variables
+   double uxx;
+   int tstep;
+};
+
+__device__ __forceinline__
+double sumu(states ust, const int ix) { return ust.u[ix] + ust.uxx; }
+
+
+__device__
+void ksStep(states *state, const int idx[3], const int tstep)
+{
+    const int itx = (tstep ^ 1);
+    state[idx[1]].u[tstep] = state[idx[1]].u[0] - (HALF * disc.dt * (itx+1)) *
+                        (convect(state[idx[0]].u[itx], state[idx[2]].u[itx]) +
+                        secondDer(sumu(state[idx[0]], itx), sumu(state[idx[2]], itx), sumu(state[idx[1]], itx)));
+}
+
+__device__
+void stepUpdate(states *state, const int idx[3])
+{
+    const int ts = DIVMOD(state[idx[1]].tstep);
+    if (state[idx[1]].tstep & 1) //Odd - Rslt is 0 for even numbers
+    {
+        state[idx[1]].uxx = secondDer(state[idx[0]].u[ts],
+                                    state[idx[2]].u[ts],
+                                    state[idx[1]].u[ts]);
+    }
+    else
+    {
+        ksStep(state, idx, ts);
+    }
+    state[idx[1]].tstep++;
+}
+
+
+__global__ void classicStep(states *state)
+{
+	auto gidz = idxes<3>();
+
+    stepUpdate(state, gidz.a);
+}
+
+__global__ void upTriangle(states *state)
+{
+	extern __shared__ states sharedState[];
+
+	int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID
+    int mid = blockDim.x >> 1;
+    const int tidx[3] = {threadIdx.x-1, threadIdx.x, threadIdx.x+1};
+
+    // Using tidx as tid is kind of confusing for reader but looks valid.
+
+	sharedState[tidx[1]] = state[gid];
+
+    __syncthreads();
+
+	for (int k=1; k<mid; k++)
+	{
+		if (tidx[1] < (blockDim.x-k) && tidx[1] >= k)
+		{
+            stepUpdate(sharedState, tidx);
+		}
+		__syncthreads();
+	}
+    state[gid] = sharedState[tidx[1]];
+}
+
+
+__global__
+void
+downTriangle(states *state)
+{
+	extern __shared__ states sharedState[];
+
+    const int tid = threadIdx.x; // Thread index
+    const int mid = blockDim.x >> 1; // Half of block size
+    const int base = blockDim.x + 2;
+    const int gid = blockDim.x * blockIdx.x + tid;
+    const int tidx[3] = {tid, tid + 1, tid + 2};
+
+
+    if (tid<2) sharedState[tid] = state[disc.modg(gid-1)];
+    sharedState[tid+2] = state[disc.modg(gid+1)];
+    __syncthreads();
+
+	for (int k=mid; k>0; k--)
+	{
+		if (tidx[1] < (base-k) && tidx[1] >= k)
+		{
+	    	stepUpdate(sharedState, tidx);
+		}
+		__syncthreads();
+	}
+    state[gid] = sharedState[tidx[1]];
+}
+
+template <bool SPLIT>
+__global__ void
+wholeDiamond(states *state)
+{
+	extern __shared__ states sharedState[];
+
+    const int mid = (blockDim.x >> 1); // Half of block size
+    const int base = blockDim.x + 2;
+    const int gid = blockDim.x * blockIdx.x + threadIdx.x + SPLIT*(disc.ht-2);
+    const int tidx[3] = {threadIdx.x, threadIdx.x + 1, threadIdx.x + 2};
+
+	int k;
+
+    if (threadIdx.x<2) sharedState[threadIdx.x] = state[disc.modg(gid-1)];
+    sharedState[tidx[2]] = state[disc.modg(gid+1)];
+
+    __syncthreads();
+
+	for (k=mid; k>0; k--)
+	{
+		if (tidx[1] < (base-k) && tidx[1] >= k)
+		{
+        	stepUpdate(sharedState, tidx);
+		}
+	
+		__syncthreads();
+	}
+
+	for (k=2; k<=mid; k++)
+	{
+		if (tidx[1] < (base-k) && tidx[1] >= k)
+		{
+            stepUpdate(sharedState, tidx);
+		}
+	
+		__syncthreads();
+    }
+
+    state[disc.modg(gid)] = sharedState[tidx[1]];
+}
+
+size_t szSt = sizeof(states);
+std::vector<int> qtest(states *hSt)
+{
+    std::vector<int> vi;
+    for (int i=0; i<rv.dv; i++)  vi.push_back(hSt[i].tstep);
+    
+    auto io = std::unique(vi.begin(), vi.end());
+
+    vi.erase(io, vi.end()); 
+    return vi;
+
+}
+
+double classicWrapper(states *aState)
+{
+    std::cout << " -- Classic Atom -- " << std::endl;
+    int tstep = 1;
+    const int npass = rv.dv * szSt;
+
+    states *dState;
+
+    cudaMalloc((void **) &dState, npass);
+
+    // Copy the initial conditions to the device array.
+    cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice);
+
+    double t_eq = 0.0;
+
+
+    while (t_eq <= rv.tf)
+    {
+        classicStep <<< rv.bks, rv.tpb >>> (dState);
+        tstep++;
+        if (tstep%NSTEPS == 0) t_eq += rv.dt;
+    }
+
+    cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost);
+    std::cout << tstep << std::endl;
+    cudaFree(dState);
+
+    std::vector<int> stepss = qtest(aState);
+    std::cout << "UNIQUES: ";
+    for (int i : stepss)
+        std::cout << i << " ";
+    
+    std::cout << std::endl;
+
+    return t_eq;
+}
+
+//The host routine.
+double
+sweptWrapper(states *aState)
+{
+    std::cout << " -- Swept Atom -- " << std::endl;
+    const size_t szSt = sizeof(states); 
+    const size_t smem = (rv.tpb+2)*szSt;
+    
+    std::cout << "SHARED MEMORY: " << smem << std::endl;
+    int tstep = 1;
+	states *dState;
+    const int npass = rv.dv * szSt;
+
+	cudaMalloc((void **)&dState, npass);
+
+	// Copy the initial conditions to the device array.
+	cudaMemcpy(dState, aState, npass, cudaMemcpyHostToDevice);
+	//Start the counter and start the clock.
+
+	//Every other step is a full timestep and each cycle is half tpb steps.
+    
+    int stepi = rv.tpb/2;
+	double t_fullstep = 0.25 * rv.dt * (double)rv.tpb;
+	double t_eq;
+
+	t_eq = t_fullstep;
+
+	upTriangle <<< rv.bks, rv.tpb, smem >>> (dState);
+
+	//Split
+	wholeDiamond <true> <<< rv.bks, rv.tpb, smem >>> (dState);
+
+    tstep += stepi;
+
+
+	// Call the kernels until you reach the iteration limit.
+	while(t_eq < rv.tf)
+	{
+		wholeDiamond <false> <<< rv.bks, rv.tpb, smem >>> (dState);
+
+        tstep += stepi;
+		
+		wholeDiamond <true> <<< rv.bks, rv.tpb, smem >>> (dState);
+
+		t_eq  += t_fullstep;
+        tstep += stepi;
+	}
+
+	downTriangle <<< rv.bks, rv.tpb, smem >>>(dState);
+
+	cudaMemcpy(aState, dState, npass, cudaMemcpyDeviceToHost);
+
+    cudaFree(dState);
+    
+    std::vector<int> stepss = qtest(aState);
+    std::cout << "UNIQUES: ";
+    for (int i : stepss)
+        std::cout << i << " ";
+    
+    std::cout << aState[0].tstep << " " << aState[50].tstep << " " << aState[0].tstep << " " << tstep << std::endl;
+
+	return t_eq;
+}
diff --git a/investigations/flatLongTest/KSmain.cu b/investigations/flatLongTest/KSmain.cu
new file mode 100644
index 0000000..cc3d18c
--- /dev/null
+++ b/investigations/flatLongTest/KSmain.cu
@@ -0,0 +1,375 @@
+/**
+    Copyright (C) 2017 Kyle Niemeyer, niemeyek@oregonstate.edu AND
+    Daniel Magee, mageed@oregonstate.edu
+*/
+/*
+    This file is distribued under the MIT License.  See LICENSE at top level of directory or: <https://opensource.org/licenses/MIT>.
+*/
+
+//COMPILE LINE!
+// nvcc -o ./bin/KSOut KS1D_SweptShared.cu -gencode arch=compute_35,code=sm_35 -lm -restrict -Xcompiler -fopenmp --ptxas-options=-v
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+#include <device_functions.h>
+
+#include <iostream>
+#include <ostream>
+#include <algorithm>
+#include <iterator>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <fstream>
+#include <string>
+#include "cudaUtils.h"
+#include <vector>
+
+// #include "matplotlib-cpp/matplotlibcpp.h"
+
+// namespace plt = matplotlibcpp;
+
+#ifndef GPUNUM
+    #define GPUNUM              0
+#endif
+
+#define QUARTER     0.25
+#define HALF        0.5
+#define ONE         1.0
+#define TWO         2.0
+#define FOUR        4.0
+#define SIX			6.0
+
+#define NSTEPS              4 // How many substeps in timestep.
+
+// Since anyone would need to write a header and functions file, why not just hardwire this.
+// If the user's number of steps isn't a power of 2 use the other one.
+
+#define MODULA(x)           x & (NSTEPS-1)
+// #define MODULA(x)           x % NSTEPS
+
+#define DIVMOD(x)           (MODULA(x)) >> 1
+
+
+__host__ __device__
+__forceinline__
+int mod(int a, int b) { return (a % b + b) % b; }
+
+const double dx = 0.5;
+
+struct discConstants{
+
+	double dxTimes4;
+	double dx2;
+	double dx4;
+	double dt;
+	double dt_half;
+	int base;
+	int ht;
+    int idxend;
+    int nPts;
+    __host__ __device__
+    __forceinline__
+    int modg(int gd) { return mod(gd, nPts);}
+};
+
+__constant__ discConstants disc;
+
+template <int n>
+struct ar{
+    int a[n];
+};
+
+template <int n>
+__device__
+ar<n> idxes()
+{
+    const int gid = blockDim.x * blockIdx.x + threadIdx.x;
+    const int st = n/2;
+    ar <n> ii;
+    #pragma unroll
+    for (int k=0; k<n; k++) ii.a[k] = disc.modg((gid-st) + k);
+	return ii;
+}
+
+//Initial condition.
+__host__
+double initFun(double xnode)
+{
+	return TWO * cos(19.0*xnode*M_PI/128.0);
+}
+
+__device__ __forceinline__
+double fourthDer(double tfarLeft, double tLeft, double tCenter, double tRight, double tfarRight)
+{
+	return (tfarLeft - FOUR*tLeft + SIX*tCenter - FOUR*tRight + tfarRight)*(disc.dx4);
+}
+
+__device__ __forceinline__
+double secondDer(double tLeft, double tRight, double tCenter)
+{
+	return (tLeft + tRight - TWO*tCenter)*(disc.dx2);
+}
+
+__device__ __forceinline__
+double convect(double tLeft, double tRight)
+{
+	return (tRight*tRight - tLeft*tLeft)*(disc.dxTimes4);
+}
+
+struct runVar{
+    int bks, tpb, dv;
+    double dt, tf, lx;
+    void printme()
+    {
+        std::cout << "KS ---" << std::endl;
+        std::cout << "#Blocks: " << bks << " | Length: " << lx << " | dt/dx: " << dt/dx << std::endl;
+        std::cout << "TPB: " << tpb << " Grid Size " << dv << std::endl;
+        std::cout << "--------------------" << std::endl;
+    }
+} rv;
+
+#include "KSArray.hpp"
+#ifdef PAD
+    #include "KSAtomicPad.hpp"
+#else
+    #include "KSAtomic.hpp"
+#endif
+
+std::vector<double> tests(double *d1, states *s2, int dv, double tl)
+{
+	int t = 0;
+	double absit;
+    double summit   = 0;
+    double sumd     = 0;
+    std::vector <double> sto;
+
+	for(int k=0; k<dv; k++)
+	{
+		absit   = (std::abs(d1[k] - s2[k].u[0]));
+		t      += (absit > tl);
+        summit += absit;
+        sumd   += std::abs(d1[k]);
+        sto.push_back(absit);
+	}
+
+	if (!t)
+	{
+		std::cout << "EQUAL! " << summit << " OUT OF " << sumd << std::endl;
+	}
+	else
+	{
+		std::cout << "NOT EQUAL! " << summit << " OUT OF " << sumd << std::endl;
+		std::cout << t << " out of " << dv << std::endl;
+    }
+    return sto;
+}
+
+std::vector<double> tests(double *d1, double *s2, int dv, double tl)
+{
+	int t = 0;
+	double absit;
+    double summit = 0;
+    double sumd = 0;
+    std::vector <double> sto;
+
+	for(int k=0; k<dv; k++)
+	{
+		absit   = (std::abs(d1[k] - s2[k]));
+		t      += (absit > tl);
+        summit += absit;
+        sumd   += std::abs(d1[k]);
+        sto.push_back(absit);
+	}
+
+	if (!t)
+	{
+		std::cout << "EQUAL! " << summit << " OUT-OF " << sumd << std::endl;
+	}
+	else
+	{
+		std::cout << "NOT EQUAL! " << summit << " OUT-OF " << sumd << std::endl;
+		std::cout << t << " out of " << dv << std::endl;
+    }
+    return sto;
+}
+
+inline void writeTime(const cudaTime &Timer)
+{
+    if (Timer.nTimes > 10) return;
+
+    std::ofstream timeOut;
+    timeOut.open("atomicArray.csv", std::ios::app);
+    timeOut.seekp(0, std::ios::end);
+    bool starts = (timeOut.tellp() == 0);
+    if (starts) timeOut << "tpb,dv,sweptArray,sweptAtomic,classicArray,classicAtomic\n";
+
+    timeOut << rv.tpb << "," << rv.dv;
+    for (int i=0; i<Timer.nTimes; i++) timeOut << "," << Timer.times[i];
+    timeOut << std::endl;
+    timeOut.close();
+}
+
+
+int main( int argc, char *argv[])
+{
+
+	cudaSetDevice(GPUNUM);
+	cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
+    rv.dv = atoi(argv[1]); //Number of spatial points
+	rv.tpb = atoi(argv[2]); //Threads per Block
+    rv.dt = atof(argv[3]); //delta T timestep
+	rv.tf = atof(argv[4]) - 0.25*rv.dt; //Finish time
+    rv.bks = rv.dv/rv.tpb; //The number of blocks
+    rv.lx = rv.dv*dx;
+
+    rv.printme();
+
+    discConstants dsc = {
+		ONE/(FOUR*dx), //dx
+		ONE/(dx*dx), //dx^2
+		ONE/(dx*dx*dx*dx), //dx^4
+		rv.dt, //dt
+		rv.dt*0.5, //dt half
+		rv.tpb + 4, //length of row of shared array
+		(rv.tpb+4)/2, //midpoint of shared array row
+		rv.dv-1, //last global thread id
+        rv.dv
+    };
+
+    std::vector<double> vatom;
+    std::vector<double> varray;
+    std::vector<double> xa;
+
+    cudaMemcpyToSymbol(disc,&dsc,sizeof(dsc));
+
+    cudaTime *timer = new cudaTime();
+    double *arrayState, *permState, *tradeState;
+    states *atomicState;
+    double tfm;
+    double tol=1.0e-3;
+
+    const int arSize = sizeof(double) * rv.dv;
+    const int atomSize = sizeof(states) * rv.dv;
+
+    cudaHostAlloc((void **) &permState, arSize, cudaHostAllocDefault);
+    cudaHostAlloc((void **) &arrayState, arSize, cudaHostAllocDefault);
+    cudaHostAlloc((void **) &atomicState, atomSize, cudaHostAllocDefault);
+    cudaHostAlloc((void **) &tradeState, arSize, cudaHostAllocDefault);
+
+    double idou, nTimes;
+    for (int k=0; k<rv.dv; k++)
+    {
+        xa.push_back(k*dx);
+        idou                = initFun((double)k*dx);
+        permState[k]        = idou;
+        arrayState[k]       = idou;
+        atomicState[k].u[0] = idou;
+    }
+
+    tests(arrayState, atomicState, rv.dv, tol);
+
+    //CLASSIC TEST
+    std::cout << " SWEPT TEST " << std::endl;
+
+    timer->tinit();
+    tfm = sweptWrapper(arrayState);
+    nTimes = tfm/rv.dt;
+    std::cout << tfm << "  " << nTimes << std::endl;
+    timer->tfinal(nTimes);
+
+    cudaKernelCheck();
+
+    timer->tinit();
+    tfm = sweptWrapper(atomicState);
+    nTimes = tfm/rv.dt;
+    std::cout << tfm << "  " << nTimes << std::endl;
+    timer->tfinal(nTimes);
+
+    cudaKernelCheck();
+
+    std::vector <double> ipon = tests(arrayState, atomicState, rv.dv, tol);
+
+    rv.tf = tfm-(0.25*rv.dt); // RESET IT
+
+    // plt::named_plot("Differences", xa, ipon, "-r");
+    // plt::legend();
+    // plt::show();
+
+    std::cout << *timer << std::endl;
+
+    std::cout << " ----------------------- " << std::endl;
+    std::cout << " CLASSIC TEST " << std::endl;
+
+    for (int k=0; k<rv.dv; k++) 
+    {
+        varray.push_back(arrayState[k]);
+        vatom.push_back(atomicState[k].u[0]);
+        tradeState[k]        = arrayState[k];
+        arrayState[k]       = permState[k];
+    }
+
+    // plt::named_plot("Atomic", xa, vatom, "-r");
+    // plt::named_plot("Array", xa, varray, "-b");
+    // plt::title("Classic");
+    // plt::legend();
+    // plt::show();
+    
+    timer->tinit();
+    tfm = classicWrapper(arrayState);
+    nTimes = tfm/rv.dt;
+    std::cout << tfm << "  " << nTimes << std::endl;
+    timer->tfinal(nTimes);
+
+    cudaKernelCheck();
+
+    std::cout << " Swept vs Classic ARRAY Agreement: "; 
+    tests(tradeState, arrayState, rv.dv, tol);
+
+    for (int k=0; k<rv.dv; k++) 
+    {
+        tradeState[k]        = atomicState[k].u[0];
+        atomicState[k].u[0]  = permState[k];
+    }
+    
+    timer->tinit();
+    tfm = classicWrapper(atomicState);
+    nTimes = tfm/rv.dt;
+    std::cout << tfm << "  " << nTimes << std::endl;
+    timer->tfinal(nTimes);
+
+    cudaKernelCheck();
+    std::cout << " Swept vs Classic ATOMIC Agreement: "; 
+    tests(tradeState, atomicState, rv.dv, tol);
+
+    ipon = tests(arrayState, atomicState, rv.dv, tol);
+    // plt::named_plot("Differences", xa, ipon, "-r");
+    // plt::legend();
+    // plt::show();
+
+    std::cout << *timer << std::endl;
+    for (int k = 0; k<rv.dv; k++)
+    {
+        varray[k] = arrayState[k];
+        vatom[k]  = atomicState[k].u[0];
+    }
+
+    if (argc>5) writeTime(*timer);
+
+    // plt::named_plot("Atomic", xa, vatom, "-r");
+    // plt::named_plot("Array", xa, varray, "-b");
+    // plt::title("Swept");
+    // plt::legend();
+    // plt::show();
+
+    std::cout << " ----------------------- " << std::endl;
+
+    cudaFreeHost(arrayState);
+    cudaFreeHost(atomicState);
+    cudaFreeHost(permState);
+
+    delete timer;
+
+    return 0;
+}
\ No newline at end of file
diff --git a/investigations/flatLongTest/Makefile b/investigations/flatLongTest/Makefile
new file mode 100644
index 0000000..b2a2cd9
--- /dev/null
+++ b/investigations/flatLongTest/Makefile
@@ -0,0 +1,27 @@
+
+
+CUDAFLAGS :=  -gencode arch=compute_35,code=sm_35 -restrict -O3 -lm -std=c++11 -w --ptxas-options=-v
+
+ifndef NOPLOT
+	PLOTSS=-I/usr/include/python2.7 -lpython2.7
+endif
+
+ifdef PD
+	EXTO:=PAD
+else
+	PD=
+	EXTO:=
+endif
+
+# It will put the compiled programs in the bin subdirectory
+PROJECT_DIR = $(shell pwd)
+OUT_DIR := ./bin
+#$(PROJECT_DIR)
+
+default: $(OUT_DIR)/KSmain$(EXTO)
+
+$(OUT_DIR)/%$(EXTO): %.cu $(PROJECT_DIR)/*.hpp
+	nvcc $< -o $@ $(CUDAFLAGS) $(PLOTSS) $(PD)
+
+clean:
+	rm bin/*
diff --git a/investigations/flatLongTest/atomicArray.csv b/investigations/flatLongTest/atomicArray.csv
new file mode 100644
index 0000000..38325e4
--- /dev/null
+++ b/investigations/flatLongTest/atomicArray.csv
@@ -0,0 +1,61 @@
+tpb,dv,sweptArray,sweptAtomic,classicArray,classicAtomic
+32,2048,2.40007,6.63434,10.4235,29.6741
+32,4096,2.6025,7.51959,12.404,32.9315
+32,8192,3.67676,12.7995,13.6601,47.2752
+32,16384,5.08457,19.3627,16.5338,66.6656
+32,32768,8.15502,33.3647,23.7059,110.6
+32,65536,14.7692,62.9635,37.4083,210.004
+32,131072,27.6137,119.673,71.1649,384.872
+32,262144,53.1766,232.851,129.996,737.279
+32,524288,104.275,459.421,247.739,1443.36
+32,1048576,207.71,912.714,483.13,2847.12
+64,2048,1.88515,5.63549,10.5808,29.8034
+64,4096,2.0648,6.41555,12.2258,35.0574
+64,8192,2.48611,9.1848,12.1253,44.8238
+64,16384,3.5951,24.7273,15.6067,65.5351
+64,32768,5.7921,46.2661,20.1162,100.891
+64,65536,10.2214,86.9771,29.9518,179.258
+64,131072,19.0078,171.308,52.3312,330.095
+64,262144,36.4424,339.954,92.4432,629.662
+64,524288,71.3321,677.602,172.641,1228.7
+64,1048576,141.045,1352.59,333.048,2426.57
+128,2048,1.53293,5.15251,10.1668,30.46
+128,4096,1.6771,6.14865,12.2177,37.4525
+128,8192,2.03253,8.64294,12.336,45.7973
+128,16384,2.8478,18.0308,15.5637,64.6168
+128,32768,4.58072,44.6867,20.4183,104.386
+128,65536,7.99725,83.013,29.927,182.722
+128,131072,14.8442,161.414,50.0962,335.065
+128,262144,28.267,321.015,88.2068,637.475
+128,524288,55.1163,638.64,164.32,1242.33
+128,1048576,108.843,1268.33,316.867,2452.8
+256,2048,1.3505,4.95406,11.3153,30.8443
+256,4096,1.57635,7.46392,12.4013,40.9095
+256,8192,1.92905,9.21015,13.8858,49.8651
+256,16384,2.6322,16.0087,15.3831,68.2373
+256,32768,4.22022,41.0684,20.4245,104.76
+256,65536,7.40611,68.8514,30.3287,186.47
+256,131072,13.5593,128.168,50.4761,339.494
+256,262144,25.8775,254.894,88.6498,642.157
+256,524288,50.4286,508.592,165.387,1247.21
+256,1048576,99.5966,1017.17,318.505,2459.41
+512,2048,1.52177,7.71707,12.0663,40.6267
+512,4096,1.52928,7.7432,12.3056,41.3265
+512,8192,2.00342,11.3485,13.6711,58.1932
+512,16384,2.51893,16.2842,17.0839,78.3168
+512,32768,4.5901,34.3451,21.7042,115.247
+512,65536,8.04788,56.7913,30.7422,195.298
+512,131072,14.6278,112.054,51.616,358.37
+512,262144,26.6582,218.602,89.9169,675.46
+512,524288,51.6933,419.75,167.19,1305.18
+512,1048576,101.67,826.456,321.632,2562.83
+1024,2048,1.97449,11.864,14.3608,62.6596
+1024,4096,1.98058,11.9167,14.5522,62.908
+1024,8192,1.99942,11.9681,15.0342,63.3168
+1024,16384,3.04024,24.1762,18.346,99.7006
+1024,32768,4.95251,36.956,23.664,138.721
+1024,65536,8.64458,62.194,33.9697,227.136
+1024,131072,14.7341,113.775,55.9044,395.561
+1024,262144,27.0346,215.657,97.4133,738.591
+1024,524288,52.0674,419.875,179.548,1418.75
+1024,1048576,103.674,828.409,344.129,2786.37
diff --git a/investigations/flatLongTest/atomicPadding.csv b/investigations/flatLongTest/atomicPadding.csv
new file mode 100644
index 0000000..aac4add
--- /dev/null
+++ b/investigations/flatLongTest/atomicPadding.csv
@@ -0,0 +1,61 @@
+tpb,dv,classicArray,classicAtomic,sweptArray,sweptAtomic
+32,2048,10.5145,37.2119,2.35491,10.6148
+32,4096,12.3915,45.3359,2.58066,11.9703
+32,8192,13.5635,67.7075,3.66284,20.8072
+32,16384,16.8177,104.992,5.08569,32.5191
+32,32768,23.6894,182.908,8.12744,56.7365
+32,65536,37.5551,337.811,14.7585,104.285
+32,131072,71.0992,624.913,27.6128,201.835
+32,262144,130.013,1212.95,53.1499,396.345
+32,524288,247.638,2379.28,104.263,784.76
+32,1048576,482.919,4718.92,206.56,1562.93
+64,2048,10.4119,37.267,1.85109,9.61128
+64,4096,12.1452,45.4394,2.0203,10.7847
+64,8192,12.104,62.9761,2.472,14.5321
+64,16384,15.5412,102.015,3.56851,43.8224
+64,32768,20.223,172.222,5.78267,80.5145
+64,65536,29.8674,316.25,10.2022,153.761
+64,131072,52.4047,588.891,19.0093,305.169
+64,262144,92.2957,1136.43,36.4602,610.572
+64,524288,172.54,2230.38,71.3658,1221.9
+64,1048576,332.946,4418.75,141.111,2443.84
+128,2048,10.2113,41.1027,1.52081,10.6074
+128,4096,12.2082,49.9857,1.66086,12.5047
+128,8192,12.3124,65.5395,2.02396,14.9668
+128,16384,15.6147,99.2663,2.84368,29.5843
+128,32768,20.4046,175.152,4.57271,61.0777
+128,65536,29.8627,326.316,8.01774,118.123
+128,131072,49.9871,607.73,14.8674,226.007
+128,262144,88.2125,1159.59,28.2828,445.944
+128,524288,164.271,2260.31,55.136,892.943
+128,1048576,316.65,4457.8,108.906,1784.42
+256,2048,11.3281,41.7556,1.34541,11.2674
+256,4096,12.3938,56.648,1.59529,14.5261
+256,8192,13.7884,72.7568,1.91884,17.2108
+256,16384,15.447,106.5,2.6278,26.6545
+256,32768,20.4602,178.801,4.21093,49.3025
+256,65536,30.1686,329.489,7.3595,98.4544
+256,131072,50.2862,606.564,13.6116,180.673
+256,262144,88.6015,1159.79,25.8546,348.425
+256,524288,165.201,2259.47,50.4757,682.442
+256,1048576,318.283,4456.44,99.6421,1355.42
+512,2048,12.1479,57.5493,1.51909,15.2436
+512,4096,12.1921,57.9564,1.52334,15.3118
+512,8192,13.5092,87.884,2.00571,20.6509
+512,16384,17.1218,121.834,2.60008,36.8136
+512,32768,21.7258,187.141,4.68618,61.4165
+512,65536,30.7689,342.344,7.92118,103.953
+512,131072,51.5981,625.877,14.7362,189.635
+512,262144,89.9209,1175.56,26.6776,364.381
+512,524288,167.145,2275.55,52.309,710.869
+512,1048576,321.477,4482.63,102.133,1410.78
+1024,2048,14.3089,88.6339,1.96935,21.3711
+1024,4096,14.6172,89.0899,1.97433,21.4486
+1024,8192,15.0064,90.181,1.98356,21.5563
+1024,16384,18.3972,155.839,3.12533,41.9673
+1024,32768,23.6427,221.682,4.9535,63.0426
+1024,65536,34.0302,386.226,8.7269,105.694
+1024,131072,55.8035,699.614,15.1361,193.193
+1024,262144,97.2882,1268.49,27.3374,377.109
+1024,524288,179.472,2443.64,52.7626,733.811
+1024,1048576,344.118,4774.14,104.366,1465.54
diff --git a/investigations/flatLongTest/cudaUtils.h b/investigations/flatLongTest/cudaUtils.h
new file mode 100644
index 0000000..26f940b
--- /dev/null
+++ b/investigations/flatLongTest/cudaUtils.h
@@ -0,0 +1,175 @@
+/*
+    UTILITIES FOR HSWEEP.  TIMER and DRIVER AGREEMENT CHECKER.
+*/
+
+#ifndef SWEEPUTIL_H
+#define  SWEEPUTIL_H
+#include <stdio.h>
+#include <iostream>
+#include <string>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <vector>
+#include <array>
+
+// template< typename T >
+// void check(T result, char const *const func, const char *const file, int const line)
+// {
+//     if (result)
+//     {
+//         fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n",
+//                 file, line, static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+//         DEVICE_RESET
+//         // Make sure we call CUDA Device Reset before exiting
+//         exit(EXIT_FAILURE);
+//     }
+// }
+
+// #define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
+
+void cudaKernelCheck(bool gpuq=true)
+{
+    if (gpuq)
+    {
+        cudaError_t error = cudaGetLastError();
+        if(error != cudaSuccess)
+        {
+            // print the CUDA error message and exit
+            printf("CUDA error: %s\n", cudaGetErrorString(error));
+            exit(-1);
+        }
+        cudaDeviceSynchronize();
+    }
+}
+
+// From cppOverload in CUDASamples.
+#define OUTPUT_ATTR(attr)  \
+    printf("Shared Size:   %d\n", (int)attr.sharedSizeBytes);   \
+    printf("Constant Size: %d\n", (int)attr.constSizeBytes);                 \
+    printf("Local Size:    %d\n", (int)attr.localSizeBytes);                 \
+    printf("Max Threads Per Block: %d\n", attr.maxThreadsPerBlock);          \
+    printf("Number of Registers: %d\n", attr.numRegs);                       \
+    printf("PTX Version: %d\n", attr.ptxVersion);                            \
+    printf("Binary Version: %d\n", attr.binaryVersion);                      \
+
+void cudaRunCheck()
+{
+    int rv, dv;
+    cudaDriverGetVersion(&dv);
+    cudaRuntimeGetVersion(&rv);
+    printf("CUDA Driver Version / Runtime Version  --- %d.%d / %d.%d\n", dv/1000, (dv%100)/10, rv/1000, (rv%100)/10);
+}
+
+void factor(int n, int* factors)
+{
+    int sq = std::sqrt(n);
+    int outf = n/sq;
+    while (outf * sq != n)
+    {
+        sq--;
+        outf=n/sq;
+    }
+    factors[0] = sq;
+    factors[1] = outf;
+}
+
+struct cudaTime
+{
+    std::vector<float> times;
+    cudaEvent_t start, stop;
+    int nTimes=0;
+	float ti;
+    std::string typ = "GPU";
+
+    cudaTime() {
+        cudaEventCreate( &start );
+	    cudaEventCreate( &stop );
+    };
+    ~cudaTime()
+    {
+        cudaEventDestroy( start );
+	    cudaEventDestroy( stop );
+    };
+
+    void tinit(){ cudaEventRecord( start, 0); };
+
+    void tfinal(double nTimesteps=1.0) {
+        cudaEventRecord(stop, 0);
+	    cudaEventSynchronize(stop);
+	    cudaEventElapsedTime( &ti, start, stop);
+        ti *= 1.0e3;
+        nTimes++;
+        times.push_back(ti/nTimesteps);
+    };
+
+    float getLastTime()
+    {
+        return times.back();
+    };
+
+    float avgt() {
+        double sumt = 0.0;
+        for (auto itime: times) sumt += itime;
+        return sumt/(double)times.size();
+    };
+
+    friend std::ostream& operator<< (std::ostream &out, const cudaTime &ct);
+};
+
+
+std::ostream& operator<< (std::ostream &out, const cudaTime &ct)
+{
+    out << "Simulation Time (us) - ";
+    for (auto itt: ct.times)
+        out << itt << " | ";
+
+    return out;
+}
+
+#ifdef MPIS
+#include <mpi.h>
+
+struct mpiTime
+{
+    std::vector<double> times;
+    double ti;
+
+    std::string typ = "CPU";
+
+    void tinit(){ ti = MPI_Wtime(); }
+
+    void tfinal() { times.push_back((MPI_Wtime()-ti)*1.e6); }
+
+    int avgt() {
+        double sumt = 0.0;
+        for (auto itime: times) sumt += itime;
+        return sumt/(double)times.size();
+    };
+};
+
+#endif
+// void atomicWrite(std::string st, std::vector<double> t)
+// {
+//     FILE *tTemp;
+//     MPI_Barrier(MPI_COMM_WORLD);
+
+//     for (int k=0; k<nprocs; k++)
+//     {
+//         if (ranks[1] == k)
+//         {
+//             tTemp = fopen(fname.c_str(), "a+");
+//             fseek(tTemp, 0, SEEK_END);
+//             fprintf(tTemp, "\n%d,%s", ranks[1], st.c_str());
+//             for (auto i = t.begin(); i != t.end(); ++i)
+//             {
+//                 fprintf(tTemp, ",%4f", *i);
+//             }
+//         }
+//         MPI_Barrier(MPI_COMM_WORLD);
+//     }
+// }
+
+#endif
\ No newline at end of file
diff --git a/investigations/flatLongTest/images/PaddedSpeedupvsAtomic.png b/investigations/flatLongTest/images/PaddedSpeedupvsAtomic.png
new file mode 100644
index 0000000..e298cc9
Binary files /dev/null and b/investigations/flatLongTest/images/PaddedSpeedupvsAtomic.png differ
diff --git a/investigations/flatLongTest/matplotlib-cpp b/investigations/flatLongTest/matplotlib-cpp
new file mode 160000
index 0000000..1d23b28
--- /dev/null
+++ b/investigations/flatLongTest/matplotlib-cpp
@@ -0,0 +1 @@
+Subproject commit 1d23b2883db61760bff4a15edbdf3944e7f91935
diff --git a/investigations/flatLongTest/padTest.cpp b/investigations/flatLongTest/padTest.cpp
new file mode 100644
index 0000000..fd2a8a8
--- /dev/null
+++ b/investigations/flatLongTest/padTest.cpp
@@ -0,0 +1,143 @@
+/**
+    Copyright (C) 2017 Kyle Niemeyer, niemeyek@oregonstate.edu AND
+    Daniel Magee, mageed@oregonstate.edu
+*/
+/*
+    This file is distribued under the MIT License.  See LICENSE at top level of directory or: <https://opensource.org/licenses/MIT>.
+*/
+
+//COMPILE LINE!
+// nvcc -o ./bin/KSOut KS1D_SweptShared.cu -gencode arch=compute_35,code=sm_35 -lm -restrict -Xcompiler -fopenmp --ptxas-options=-v
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cuda_runtime.h>
+#include <device_functions.h>
+
+#include <iostream>
+#include <ostream>
+#include <algorithm>
+#include <iterator>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include <fstream>
+
+#ifndef GPUNUM
+    #define GPUNUM              0
+#endif
+
+#define QUARTER     0.25
+#define HALF        0.5
+#define ONE         1.0
+#define TWO         2.0
+#define FOUR        4.0
+#define SIX			6.0
+
+#define NSTEPS              4 // How many substeps in timestep.
+
+// Since anyone would need to write a header and functions file, why not just hardwire this.
+// If the user's number of steps isn't a power of 2 use the other one.
+
+#define MODULA(x)           x & (NSTEPS-1)
+// #define MODULA(x)           x % NSTEPS
+
+#define DIVMOD(x)           (MODULA(x)) >> 1
+
+const double dx = 0.5;
+
+struct discConstants{
+
+	double dxTimes4;
+	double dx2;
+	double dx4;
+	double dt;
+	double dt_half;
+	int base;
+	int ht;
+    int idxend;
+};
+
+__constant__ discConstants disc;
+
+//Initial condition.
+__host__
+double initFun(double xnode)
+{
+	return TWO * cos(19.0*xnode*M_PI/128.0);
+}
+
+__device__ __forceinline__
+double fourthDer(double tfarLeft, double tLeft, double tCenter, double tRight, double tfarRight)
+{
+	return (tfarLeft - FOUR*tLeft + SIX*tCenter - FOUR*tRight + tfarRight)*(disc.dx4);
+}
+
+__device__ __forceinline__
+double secondDer(double tLeft, double tRight, double tCenter)
+{
+	return (tLeft + tRight - TWO*tCenter)*(disc.dx2);
+}
+
+__device__ __forceinline__
+double convect(double tLeft, double tRight)
+{
+	return (tRight*tRight - tLeft*tLeft)*(disc.dxTimes4);
+}
+
+struct runVar{
+    int bks, tpb, dv;
+    double dt, tf, lx;
+    size_t smem;
+
+    void printme()
+    {
+        cout << "KS ---" << endl;
+        cout << "#Blocks: " << bks << " | Length: " << lx << " | dt/dx: " << dt/dx << endl;
+        cout << "TPB: " << tpb << " Grid Size " << dv << endl;
+    }
+} rv;
+
+#include "KSAtomic.hpp"
+
+int main( int argc, char *argv[])
+{
+
+	cudaSetDevice(GPUNUM);
+	// if cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
+
+    rv.dv = atoi(argv[1]); //Number of spatial points
+	rv.tpb = atoi(argv[2]); //Threads per Block
+    rv.dt = atof(argv[3]); //delta T timestep
+	rv.tf = atof(argv[4]) - 0.25*rv.dt; //Finish time
+    rv.bks = rv.dv/rv.tpb; //The number of blocks
+    rv.lx = rv.dv*dx;
+    rv.smem = (2*rv.tpb+8)*szSt;
+
+    rv.printme();
+
+    discConstants dsc = {
+		ONE/(FOUR*dx), //dx
+		ONE/(dx*dx), //dx^2
+		ONE/(dx*dx*dx*dx), //dx^4
+		rv.dt, //dt
+		rv.dt*0.5, //dt half
+		rv.tpb + 4, //length of row of shared array
+		(rv.tpb+4)/2, //midpoint of shared array row
+		rv.dv-1 //last global thread id
+	};
+
+    cudaMemcpyToSymbol(disc,&dsc,sizeof(dsc));
+
+    double *arrayState;
+    states *atomicState;
+
+    // Initialize, time, write out.  Compare Array to Atomic.
+    // YOU HAVE 1 HOUR.
+
+    cudaFreeHost(arrayState);
+    cudaFreeHost(atomicState);
+
+    return 0;
+
+}
\ No newline at end of file
diff --git a/investigations/flatLongTest/rslts/atomicArray.csv b/investigations/flatLongTest/rslts/atomicArray.csv
new file mode 100644
index 0000000..38325e4
--- /dev/null
+++ b/investigations/flatLongTest/rslts/atomicArray.csv
@@ -0,0 +1,61 @@
+tpb,dv,sweptArray,sweptAtomic,classicArray,classicAtomic
+32,2048,2.40007,6.63434,10.4235,29.6741
+32,4096,2.6025,7.51959,12.404,32.9315
+32,8192,3.67676,12.7995,13.6601,47.2752
+32,16384,5.08457,19.3627,16.5338,66.6656
+32,32768,8.15502,33.3647,23.7059,110.6
+32,65536,14.7692,62.9635,37.4083,210.004
+32,131072,27.6137,119.673,71.1649,384.872
+32,262144,53.1766,232.851,129.996,737.279
+32,524288,104.275,459.421,247.739,1443.36
+32,1048576,207.71,912.714,483.13,2847.12
+64,2048,1.88515,5.63549,10.5808,29.8034
+64,4096,2.0648,6.41555,12.2258,35.0574
+64,8192,2.48611,9.1848,12.1253,44.8238
+64,16384,3.5951,24.7273,15.6067,65.5351
+64,32768,5.7921,46.2661,20.1162,100.891
+64,65536,10.2214,86.9771,29.9518,179.258
+64,131072,19.0078,171.308,52.3312,330.095
+64,262144,36.4424,339.954,92.4432,629.662
+64,524288,71.3321,677.602,172.641,1228.7
+64,1048576,141.045,1352.59,333.048,2426.57
+128,2048,1.53293,5.15251,10.1668,30.46
+128,4096,1.6771,6.14865,12.2177,37.4525
+128,8192,2.03253,8.64294,12.336,45.7973
+128,16384,2.8478,18.0308,15.5637,64.6168
+128,32768,4.58072,44.6867,20.4183,104.386
+128,65536,7.99725,83.013,29.927,182.722
+128,131072,14.8442,161.414,50.0962,335.065
+128,262144,28.267,321.015,88.2068,637.475
+128,524288,55.1163,638.64,164.32,1242.33
+128,1048576,108.843,1268.33,316.867,2452.8
+256,2048,1.3505,4.95406,11.3153,30.8443
+256,4096,1.57635,7.46392,12.4013,40.9095
+256,8192,1.92905,9.21015,13.8858,49.8651
+256,16384,2.6322,16.0087,15.3831,68.2373
+256,32768,4.22022,41.0684,20.4245,104.76
+256,65536,7.40611,68.8514,30.3287,186.47
+256,131072,13.5593,128.168,50.4761,339.494
+256,262144,25.8775,254.894,88.6498,642.157
+256,524288,50.4286,508.592,165.387,1247.21
+256,1048576,99.5966,1017.17,318.505,2459.41
+512,2048,1.52177,7.71707,12.0663,40.6267
+512,4096,1.52928,7.7432,12.3056,41.3265
+512,8192,2.00342,11.3485,13.6711,58.1932
+512,16384,2.51893,16.2842,17.0839,78.3168
+512,32768,4.5901,34.3451,21.7042,115.247
+512,65536,8.04788,56.7913,30.7422,195.298
+512,131072,14.6278,112.054,51.616,358.37
+512,262144,26.6582,218.602,89.9169,675.46
+512,524288,51.6933,419.75,167.19,1305.18
+512,1048576,101.67,826.456,321.632,2562.83
+1024,2048,1.97449,11.864,14.3608,62.6596
+1024,4096,1.98058,11.9167,14.5522,62.908
+1024,8192,1.99942,11.9681,15.0342,63.3168
+1024,16384,3.04024,24.1762,18.346,99.7006
+1024,32768,4.95251,36.956,23.664,138.721
+1024,65536,8.64458,62.194,33.9697,227.136
+1024,131072,14.7341,113.775,55.9044,395.561
+1024,262144,27.0346,215.657,97.4133,738.591
+1024,524288,52.0674,419.875,179.548,1418.75
+1024,1048576,103.674,828.409,344.129,2786.37
diff --git a/investigations/flatLongTest/rslts/atomicPadding.csv b/investigations/flatLongTest/rslts/atomicPadding.csv
new file mode 100644
index 0000000..aac4add
--- /dev/null
+++ b/investigations/flatLongTest/rslts/atomicPadding.csv
@@ -0,0 +1,61 @@
+tpb,dv,classicArray,classicAtomic,sweptArray,sweptAtomic
+32,2048,10.5145,37.2119,2.35491,10.6148
+32,4096,12.3915,45.3359,2.58066,11.9703
+32,8192,13.5635,67.7075,3.66284,20.8072
+32,16384,16.8177,104.992,5.08569,32.5191
+32,32768,23.6894,182.908,8.12744,56.7365
+32,65536,37.5551,337.811,14.7585,104.285
+32,131072,71.0992,624.913,27.6128,201.835
+32,262144,130.013,1212.95,53.1499,396.345
+32,524288,247.638,2379.28,104.263,784.76
+32,1048576,482.919,4718.92,206.56,1562.93
+64,2048,10.4119,37.267,1.85109,9.61128
+64,4096,12.1452,45.4394,2.0203,10.7847
+64,8192,12.104,62.9761,2.472,14.5321
+64,16384,15.5412,102.015,3.56851,43.8224
+64,32768,20.223,172.222,5.78267,80.5145
+64,65536,29.8674,316.25,10.2022,153.761
+64,131072,52.4047,588.891,19.0093,305.169
+64,262144,92.2957,1136.43,36.4602,610.572
+64,524288,172.54,2230.38,71.3658,1221.9
+64,1048576,332.946,4418.75,141.111,2443.84
+128,2048,10.2113,41.1027,1.52081,10.6074
+128,4096,12.2082,49.9857,1.66086,12.5047
+128,8192,12.3124,65.5395,2.02396,14.9668
+128,16384,15.6147,99.2663,2.84368,29.5843
+128,32768,20.4046,175.152,4.57271,61.0777
+128,65536,29.8627,326.316,8.01774,118.123
+128,131072,49.9871,607.73,14.8674,226.007
+128,262144,88.2125,1159.59,28.2828,445.944
+128,524288,164.271,2260.31,55.136,892.943
+128,1048576,316.65,4457.8,108.906,1784.42
+256,2048,11.3281,41.7556,1.34541,11.2674
+256,4096,12.3938,56.648,1.59529,14.5261
+256,8192,13.7884,72.7568,1.91884,17.2108
+256,16384,15.447,106.5,2.6278,26.6545
+256,32768,20.4602,178.801,4.21093,49.3025
+256,65536,30.1686,329.489,7.3595,98.4544
+256,131072,50.2862,606.564,13.6116,180.673
+256,262144,88.6015,1159.79,25.8546,348.425
+256,524288,165.201,2259.47,50.4757,682.442
+256,1048576,318.283,4456.44,99.6421,1355.42
+512,2048,12.1479,57.5493,1.51909,15.2436
+512,4096,12.1921,57.9564,1.52334,15.3118
+512,8192,13.5092,87.884,2.00571,20.6509
+512,16384,17.1218,121.834,2.60008,36.8136
+512,32768,21.7258,187.141,4.68618,61.4165
+512,65536,30.7689,342.344,7.92118,103.953
+512,131072,51.5981,625.877,14.7362,189.635
+512,262144,89.9209,1175.56,26.6776,364.381
+512,524288,167.145,2275.55,52.309,710.869
+512,1048576,321.477,4482.63,102.133,1410.78
+1024,2048,14.3089,88.6339,1.96935,21.3711
+1024,4096,14.6172,89.0899,1.97433,21.4486
+1024,8192,15.0064,90.181,1.98356,21.5563
+1024,16384,18.3972,155.839,3.12533,41.9673
+1024,32768,23.6427,221.682,4.9535,63.0426
+1024,65536,34.0302,386.226,8.7269,105.694
+1024,131072,55.8035,699.614,15.1361,193.193
+1024,262144,97.2882,1268.49,27.3374,377.109
+1024,524288,179.472,2443.64,52.7626,733.811
+1024,1048576,344.118,4774.14,104.366,1465.54
diff --git a/investigations/flatLongTest/runatom.py b/investigations/flatLongTest/runatom.py
new file mode 100644
index 0000000..5eda301
--- /dev/null
+++ b/investigations/flatLongTest/runatom.py
@@ -0,0 +1,98 @@
+
+import subprocess as sp
+import shlex
+import os
+import os.path as op
+import sys
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import time
+
+thispath = op.abspath(op.dirname(__file__))
+impath = op.join(thispath, "images")
+
+ext = ".pdf"
+
+plt.style.use("swept.mplstyle")
+def savePlot(fh, name):
+    plotfile = op.join(impath, name + ext)
+    fh.savefig(plotfile, dpi=200, bbox_inches="tight")
+
+
+def plotdf(df, subp, name, kwarg={}):
+	icols = df.columns.values
+	dframe = {}
+	fx, ax = plt.subplots(*subp, figsize=(10,8))
+	axx = ax.ravel()
+	ccols = list(icols[:2])
+	for ix, ic in enumerate(icols[2:]):
+		cidx = ccols+[ic]
+		ibo = df[cidx].pivot(*cidx).T
+		dframe[ic] = ibo
+		ibo.plot(ax=axx[ix], title=ic, **kwarg)
+
+	fx.subplots_adjust(bottom=0.08, right=0.85, top=0.9, wspace=0.15, hspace=0.3)
+	fx.tight_layout()
+	savePlot(fx, name)
+	return dframe
+
+def minkey(df, ky):
+	dfo = df.groupby(ky).min(axis=1).drop('tpb', axis=1)
+	return dfo
+
+
+def quickPlot(f):
+	idf = pd.read_csv(f, header=0)
+	idf.rename({"dv": "gridSize"}, axis=1)
+	icols = idf.columns.values
+	name = f.split(".")[0]
+	plotdf(idf, [2,2], name+"Raw", kwarg={"loglog": True})
+	speedup = idf[icols[:2]]
+	speedup[icols[2][:5]] = idf[icols[3]] / idf[icols[2]]
+	speedup[icols[4][:5]] = idf[icols[5]] / idf[icols[4]]
+	# speedup.columns 
+	plotdf(speedup, [1, 2], name+"Speedup", kwarg={"logx": True})
+
+	return idf, speedup
+
+def getPlotBest(f):
+	idf = pd.read_csv(f, header=0)
+	minz = idf.groupby('dv').min().drop('tpb', axis=1)
+	mcol = minz.columns.values
+	speedz = pd.DataFrame()
+	print(minz)
+	speedz["Swept"] = minz[mcol[1]]/minz[mcol[0]]
+	speedz["Classic"] = minz[mcol[3]]/minz[mcol[2]]
+	ax = speedz.plot(logx=True)
+	ax.set_ylabel("Speedup")
+	ax.set_xlabel("Number of Spatial Points")
+	savePlot(ax.figure, "Speedup" + f.split(".")[0][-6:])
+
+
+if __name__ == "__main__":
+
+	if len(sys.argv) > 1: 
+		print("Just Reloading")
+		sys.exit(-1)
+
+	tpb = [2**k for k in range(5,11)]
+	nx = [2**k for k in range(11,21)]
+
+	ex = "./bin/KSmain "
+
+	times = " 0.001 10"
+
+	for t in tpb:
+		for n in nx:
+			strnia = ex + "{:d} {:d}".format(n, t) + times
+
+			exstr = shlex.split(strnia)
+			proc = sp.Popen(exstr)
+			sp.Popen.wait(proc)
+			time.sleep(3)
+
+	ss = "atomicArray.csv"
+	# framed = quickPlot(ss)
+	# plt.show()
+
diff --git a/investigations/flatLongTest/swept.mplstyle b/investigations/flatLongTest/swept.mplstyle
new file mode 100644
index 0000000..fd15355
--- /dev/null
+++ b/investigations/flatLongTest/swept.mplstyle
@@ -0,0 +1,17 @@
+
+lines.linewidth: 3
+lines.markersize: 8
+figure.figsize: 6, 4
+figure.titlesize: xx-large
+figure.titleweight: bold
+axes.titlesize: x-large   
+axes.labelsize: large
+xtick.major.size: 6
+xtick.labelsize: large
+ytick.labelsize: large
+ytick.major.size: 6
+grid.alpha: 0.5
+axes.grid: True
+savefig.dpi: 1000
+savefig.bbox: tight
+font.family: serif
diff --git a/runtools/timing_analysis.py b/runtools/timing_analysis.py
index dd64b00..a9a06a0 100644
--- a/runtools/timing_analysis.py
+++ b/runtools/timing_analysis.py
@@ -153,6 +153,7 @@ def getFitRs(dfp):
         sp, inter, r2, p, std_err = scipy.stats.linregress(np.array(dfa.index), dfa[s])
         inter = np.exp(inter)
         spcol[s] = [sp, inter, r2]
+        
     spcol.index = ["A", "b", "r2"]
     return spcol