diff --git a/README.md b/README.md
index b32f707..ba11b82 100644
--- a/README.md
+++ b/README.md
@@ -16,10 +16,13 @@ For the list of all the contributors to the development of FLUPS, description an
 FLUPS' design, implementation, and performances are described in two papers.
 
 If you use FLUPS, please cite them in your publications:
-- [Balty et al.](https://arxiv.org/abs/2211.07777), **FLUPS - a flexible and performant massively parallel Fourier transform library**, submitted, 2022
+- [Balty et al.](https://arxiv.org/abs/2211.07777), **FLUPS - a flexible and performant massively parallel Fourier transform library**, IEEE Transactions on Parallel and Distributed Systems, 2023
 - [Caprace et al.](https://arxiv.org/abs/2006.09300), **FLUPS - A Fourier-based Library of Unbounded Poisson Solvers**, SIAM Journal on Scientific Computing, 2021
 
 
+The high-order Lattice Green's functions (LGF and MEHR) available in FLUPS are described in a third paper. If you use those kernels, please cite the related paper in your publications:
+- [Gabbard et al.](https://arxiv.org/abs/2309.13503), **Lattice Green’s Functions for High-Order Finite Difference Stencils**, SIAM Journal on Numerical Analysis, 2024
+
 ## Why should you use FLUPS?
 - You can solve the Poisson on rectangular and uniform distributed grids;
 - You can use either cell-centred or node-centred data layout;
@@ -231,7 +234,8 @@ flups_solve(mysolver,rhs, rhs);
 
 Then, destroy the solver and the created topology
 ```
-flups_cleanup(mysolver);
+flups_cleanup(mysolver); // destroy the solver
+flups_cleanup_fftw();    // cleanup the fftw stuff
 flups_topo_free(topo);
 for (int id = 0; id < 3; id++) {
     for (int is = 0; is < 2; is++) {
diff --git a/kernel/MEHR_4F_2d_32.ker b/kernel/MEHR_4F_2d_32.ker
new file mode 100644
index 0000000..8743a46
Binary files /dev/null and b/kernel/MEHR_4F_2d_32.ker differ
diff --git a/kernel/MEHR_4L6L_2d_32.ker b/kernel/MEHR_4L6L_2d_32.ker
new file mode 100644
index 0000000..380dfd9
Binary files /dev/null and b/kernel/MEHR_4L6L_2d_32.ker differ
diff --git a/kernel/MEHR_6F_2d_32.ker b/kernel/MEHR_6F_2d_32.ker
new file mode 100644
index 0000000..3700638
Binary files /dev/null and b/kernel/MEHR_6F_2d_32.ker differ
diff --git a/samples/compareACCFFT/main.cpp b/samples/compareACCFFT/main.cpp
index c2bfa32..b1f1b7c 100644
--- a/samples/compareACCFFT/main.cpp
+++ b/samples/compareACCFFT/main.cpp
@@ -7,10 +7,9 @@
 #include <iostream>
 
 #include "accfft.h"
-#include "h3lpr/profiler.hpp"
-#include "h3lpr/parser.hpp"
 #include "flups.h"
-
+#include "h3lpr/parser.hpp"
+#include "h3lpr/profiler.hpp"
 
 int main(int argc, char *argv[]) {
     //-------------------------------------------------------------------------
@@ -31,29 +30,28 @@ int main(int argc, char *argv[]) {
     // Get info from the command line
     //--------------------------------------------------------------------------
     H3LPR::Parser parser(argc, (const char **)argv);
-    const auto    arg_nglob = parser.GetValues<int, 3>("--nglob", "the global resolution, will be used for both ACCFFT and FLUPS", {64,64,64});
+    const auto    arg_nglob = parser.GetValues<int, 3>("--nglob", "the global resolution, will be used for both ACCFFT and FLUPS", {64, 64, 64});
     const auto    arg_nproc = parser.GetValues<int, 3>("--nproc", "the proc distribution, for FLUPS only", {1, 1, 1});
     const auto    arg_dom   = parser.GetValues<double, 3>("--dom", "the size of the domain, must be compatible with nglob", {1.0, 1.0, 1.0});
     const int     n_iter    = parser.GetValue<int>("--niter", "the number of iterations to perform", 20);
     const int     n_warm    = parser.GetValue<int>("--warm", "the number of iterations to perform when warming up", 1);
-    const bool profile = parser.GetFlag("--profile","forward the profiler to flups");
+    const bool    profile   = parser.GetFlag("--profile", "forward the profiler to flups");
     parser.Finalize();
 
     //--------------------------------------------------------------------------
     // Definition of the problem
     //--------------------------------------------------------------------------
-    const int nglob[3] = {arg_nglob[0], arg_nglob[1], arg_nglob[2]};
-    const int nproc[3] = {arg_nproc[0], arg_nproc[1], arg_nproc[2]};
-    const double L[3]  = {arg_dom[0], arg_dom[1], arg_dom[2]};
-    
+    const int    nglob[3] = {arg_nglob[0], arg_nglob[1], arg_nglob[2]};
+    const int    nproc[3] = {arg_nproc[0], arg_nproc[1], arg_nproc[2]};
+    const double L[3]     = {arg_dom[0], arg_dom[1], arg_dom[2]};
 
     // get the grid spacing
     const double h[3] = {L[0] / nglob[0], L[1] / nglob[1], L[2] / nglob[2]};
 
     // get the PER PER PER BC everywhere
     const FLUPS_CenterType center_type[3] = {CELL_CENTER, CELL_CENTER, CELL_CENTER};
-    //const FLUPS_CenterType center_type[3] = {NODE_CENTER, NODE_CENTER, NODE_CENTER};
-    FLUPS_BoundaryType    *mybc[3][2];
+    // const FLUPS_CenterType center_type[3] = {NODE_CENTER, NODE_CENTER, NODE_CENTER};
+    FLUPS_BoundaryType *mybc[3][2];
     for (int id = 0; id < 3; id++) {
         for (int is = 0; is < 2; is++) {
             mybc[id][is]    = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * 1);
@@ -61,7 +59,7 @@ int main(int argc, char *argv[]) {
         }
     }
 
-     //..........................................................................
+    //..........................................................................
     // Display
     flups_info(argc, argv);
     if (rank == 0) {
@@ -75,11 +73,9 @@ int main(int argc, char *argv[]) {
         printf("--------------------------------------------------------------\n");
     }
 
-
-
     //--------------------------------------------------------------------------
-    std::string prof_name = "beatme_nglob" + std::to_string(nglob[0]) +"_"+ std::to_string(nglob[1]) + "_" + std::to_string(nglob[2]) + "_nrank" + std::to_string(comm_size);
-    H3LPR::Profiler    prof(prof_name);
+    std::string     prof_name = "beatme_nglob" + std::to_string(nglob[0]) + "_" + std::to_string(nglob[1]) + "_" + std::to_string(nglob[2]) + "_nrank" + std::to_string(comm_size);
+    H3LPR::Profiler prof(prof_name);
 
     //--------------------------------------------------------------------------
     /** - Initialize FLUPS */
@@ -87,17 +83,17 @@ int main(int argc, char *argv[]) {
     if (rank == 0) printf("Initialization of FLUPS\n");
 
     // create a real topology
-    FLUPS_Profiler* flups_prof = (profile)? (FLUPS_Profiler*) &prof : nullptr;
-    FLUPS_Topology *topoTmp = flups_topo_new(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
-    FLUPS_Solver  *mysolver = flups_init_timed(topoTmp, mybc, h, L, NOD, center_type, flups_prof);
+    FLUPS_Profiler *flups_prof = (profile) ? (FLUPS_Profiler *)&prof : nullptr;
+    FLUPS_Topology *topoTmp    = flups_topo_new(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
+    FLUPS_Solver   *mysolver   = flups_init_timed(topoTmp, mybc, h, L, NOD, center_type, flups_prof);
 
     // set the CHAT2 green type (even if it's not used)
     flups_set_greenType(mysolver, CHAT_2);
     flups_setup(mysolver, true);
-    double *solFLU = flups_get_innerBuffer(mysolver);  
+    double *solFLU = flups_get_innerBuffer(mysolver);
 
     // to fill the data we use the inner topo
-    const Topology *topoIn =flups_get_innerTopo_physical(mysolver);
+    const Topology *topoIn = flups_get_innerTopo_physical(mysolver);
     // instruct the solver to skip the first ST
     flups_skip_firstSwitchtopo(mysolver);
 
@@ -115,18 +111,18 @@ int main(int argc, char *argv[]) {
 
     //..........................................................................
     // set some straightforward data
-    int start_id[3]; 
+    int start_id[3];
     flups_topo_get_istartGlob(topoIn, start_id);
     int topo_nmem[3] = {flups_topo_get_nmem(topoIn, 0), flups_topo_get_nmem(topoIn, 1), flups_topo_get_nmem(topoIn, 2)};
 
     // set a simple expression
     double val = 0.0;
-    for (int i2 = 0; i2 < flups_topo_get_nloc(topoIn, 2); ++i2){
-	for(int i1 = 0; i1 < flups_topo_get_nloc(topoIn, 1); ++ i1){
-	      for(int i0 = 0; i0 < flups_topo_get_nloc(topoIn, 0); ++i0){
-                //double x   = 2.0 * M_PI / nglob[0] * (i0 + topoIn->cmpt_start_id(0));
-                //double y   = 2.0 * M_PI / nglob[1] * (i1 + topoIn->cmpt_start_id(1));
-                //double z   = 2.0 * M_PI / nglob[2] * (i2 + topoIn->cmpt_start_id(2));
+    for (int i2 = 0; i2 < flups_topo_get_nloc(topoIn, 2); ++i2) {
+        for (int i1 = 0; i1 < flups_topo_get_nloc(topoIn, 1); ++i1) {
+            for (int i0 = 0; i0 < flups_topo_get_nloc(topoIn, 0); ++i0) {
+                // double x   = 2.0 * M_PI / nglob[0] * (i0 + topoIn->cmpt_start_id(0));
+                // double y   = 2.0 * M_PI / nglob[1] * (i1 + topoIn->cmpt_start_id(1));
+                // double z   = 2.0 * M_PI / nglob[2] * (i2 + topoIn->cmpt_start_id(2));
                 double x   = 2.0 * M_PI / nglob[0] * (i0 + start_id[0]);
                 double y   = 2.0 * M_PI / nglob[1] * (i1 + start_id[1]);
                 double z   = 2.0 * M_PI / nglob[2] * (i2 + start_id[2]);
@@ -147,9 +143,9 @@ int main(int argc, char *argv[]) {
     accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);
 
     // let ACCFFT decide on the topology choice, pencil in Z, as always
-    int    isize[3], osize[3], istart[3], ostart[3];
+    int isize[3], osize[3], istart[3], ostart[3];
 
-    int n_acc[3] = {nglob[2],nglob[1],nglob[0]};
+    int    n_acc[3]  = {nglob[2], nglob[1], nglob[0]};
     size_t alloc_max = accfft_local_size_dft_r2c(n_acc, isize, istart, osize, ostart, c_comm);
 
     double *data_acc = (double *)accfft_alloc(alloc_max);
@@ -278,6 +274,7 @@ int main(int argc, char *argv[]) {
         }
     }
 
+    flups_cleanup_fftw();
     MPI_Finalize();
 }
 
diff --git a/samples/compareP3DFFT++/main_compare++.cpp b/samples/compareP3DFFT++/main_compare++.cpp
index d5c35ed..5d390cc 100644
--- a/samples/compareP3DFFT++/main_compare++.cpp
+++ b/samples/compareP3DFFT++/main_compare++.cpp
@@ -9,32 +9,32 @@
 #include "p3dfft.h"
 
 void print_res(p3dfft::complex_double *A, int *sdims, int *gstart);
-void print_res(double *A, const Topology* topo);
+void print_res(double *A, const Topology *topo);
 
 /**
  * @brief Compare P3DFFT++ and FLUPS on the same FFT 3D
- * 
+ *
  * This code will time both libraries on a give FFT 3D: the full periodic/DFT.
  * Notice that P3D requires the data to be already in pencils, which
  * is not the case of FLUPS. To do a fair comparison, the time required
- * by FLUPS to switch from the initial topology to the inner pencil topology 
+ * by FLUPS to switch from the initial topology to the inner pencil topology
  * (Switch0) must be substracted from the total time of the solve.
- * 
+ *
  * Alternatively, we could have chosen to do an ODD-ODD/DST2 in the x direction.
- * As a result, FLUPS would automatically skip Switch0. However, we could not 
+ * As a result, FLUPS would automatically skip Switch0. However, we could not
  * manage to do that case with P3DFFT...
- * 
- * @param argc 
- * @param argv 
- * @return int 
+ *
+ * @param argc
+ * @param argv
+ * @return int
  */
 int main(int argc, char *argv[]) {
     //-------------------------------------------------------------------------
     // Default values
     //-------------------------------------------------------------------------
-    int n_iter = 10;
-    int res[3] = {16,16,16};
-    int nproc2D[2] = {1,2};
+    int n_iter     = 10;
+    int res[3]     = {16, 16, 16};
+    int nproc2D[2] = {1, 2};
 
     //-------------------------------------------------------------------------
     // Parse arguments
@@ -47,114 +47,114 @@ int main(int argc, char *argv[]) {
             printf(" --niter, -ni Ni :              Ni is the number of times we call the same 3D FFT (for statistics on the profiler) \n");
             return 0;
         } else if ((arg == "-np") || (arg == "--nprocs")) {
-            for (int j = 0; j<2;j++){
-                if (i + j + 1 < argc) { // Make sure we aren't at the end of argv!
-                    nproc2D[j] = atoi(argv[i+j+1]); 
-                    if(nproc2D[j]<1){
+            for (int j = 0; j < 2; j++) {
+                if (i + j + 1 < argc) {  // Make sure we aren't at the end of argv!
+                    nproc2D[j] = atoi(argv[i + j + 1]);
+                    if (nproc2D[j] < 1) {
                         fprintf(stderr, "nprocs must be >0\n");
                         return 1;
                     }
-                } else { //Missing argument
+                } else {  // Missing argument
                     fprintf(stderr, "missing argument in --nprocs\n");
                     return 1;
-                }  
+                }
             }
-            i+=2;
-        }  else if ((arg == "-res")|| (arg== "--resolution") ) {
-            for (int j = 0; j<3;j++){
-                if (i + j + 1 < argc) { // Make sure we aren't at the end of argv!
-                    res[j] = atoi(argv[i+j+1]); 
-                    if(res[j]<=0.0){
+            i += 2;
+        } else if ((arg == "-res") || (arg == "--resolution")) {
+            for (int j = 0; j < 3; j++) {
+                if (i + j + 1 < argc) {  // Make sure we aren't at the end of argv!
+                    res[j] = atoi(argv[i + j + 1]);
+                    if (res[j] <= 0.0) {
                         fprintf(stderr, "res must be >0\n");
                         return 1;
                     }
-                } else { //Missing argument
+                } else {  // Missing argument
                     fprintf(stderr, "missing argument in -res\n");
                     return 1;
-                }  
+                }
             }
-            i+=3;
-        }  else if ((arg == "-ni")|| (arg== "--niter") ) {
-            if (i + 1 < argc) { // Make sure we aren't at the end of argv!
-                n_iter = atoi(argv[i+1]); 
-                if(n_iter<1){
+            i += 3;
+        } else if ((arg == "-ni") || (arg == "--niter")) {
+            if (i + 1 < argc) {  // Make sure we aren't at the end of argv!
+                n_iter = atoi(argv[i + 1]);
+                if (n_iter < 1) {
                     fprintf(stderr, "niter must be >0\n");
                     return 1;
                 }
-            } else { //Missing argument
+            } else {  // Missing argument
                 fprintf(stderr, "missing --niter\n");
                 return 1;
-            }  
+            }
             i++;
-        // } else if ((arg == "-bc")|| (arg== "--boundary-conditions") ) {
-        //     for (int j = 0; j<6;j++){
-        //         if (i + j + 1 < argc) { // Make sure we aren't at the end of argv!
-        //             bcdef[j/2][j%2] = (BoundaryType) atoi(argv[i+j+1]); 
-        //         } else { //Missing argument
-        //             fprintf(stderr, "missing argument in --boundary-conditions\n");
-        //             return 1;
-        //         }  
-        //     }
-        //     i+=6;
+            // } else if ((arg == "-bc")|| (arg== "--boundary-conditions") ) {
+            //     for (int j = 0; j<6;j++){
+            //         if (i + j + 1 < argc) { // Make sure we aren't at the end of argv!
+            //             bcdef[j/2][j%2] = (BoundaryType) atoi(argv[i+j+1]);
+            //         } else { //Missing argument
+            //             fprintf(stderr, "missing argument in --boundary-conditions\n");
+            //             return 1;
+            //         }
+            //     }
+            //     i+=6;
         }
     }
-    
+
     //-------------------------------------------------------------------------
     // Initialize MPI
     //-------------------------------------------------------------------------
-    int rank, comm_size;
+    int      rank, comm_size;
     MPI_Comm comm = MPI_COMM_WORLD;
 
     int provided;
     // set MPI_THREAD_FUNNELED or MPI_THREAD_SERIALIZED
     int requested = MPI_THREAD_FUNNELED;
     MPI_Init_thread(&argc, &argv, requested, &provided);
-    if(provided < requested){
+    if (provided < requested) {
         FLUPS_ERROR("The MPI-provided thread behavior does not match");
     }
-   
+
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &comm_size);
 
     //-------------------------------------------------------------------------
-    //Definition of the problem
+    // Definition of the problem
     //-------------------------------------------------------------------------
-    const int     nglob[3] = {res[0], res[1], res[2]};
-    const int     nproc[3] = {1, nproc2D[0], nproc2D[1]}; //nproc[0] has to be 1 //CAUTION FOR THIS: nproc[1]<=nproc[2] !!!
-    const double  L[3]     = {1., 1., 1.};;
+    const int    nglob[3] = {res[0], res[1], res[2]};
+    const int    nproc[3] = {1, nproc2D[0], nproc2D[1]};  // nproc[0] has to be 1 //CAUTION FOR THIS: nproc[1]<=nproc[2] !!!
+    const double L[3]     = {1., 1., 1.};
+    ;
 
     const double h[3] = {L[0] / nglob[0], L[1] / nglob[1], L[2] / nglob[2]};
 
-    FLUPS_BoundaryType* mybc[3][2];
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
-            mybc[id][is] = (FLUPS_BoundaryType*) flups_malloc(sizeof(int)*1);
+    FLUPS_BoundaryType *mybc[3][2];
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
+            mybc[id][is]    = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * 1);
             mybc[id][is][0] = PER;
         }
     }
 
-    if(comm_size!=nproc[0]*nproc[1]*nproc[2])
+    if (comm_size != nproc[0] * nproc[1] * nproc[2])
         FLUPS_ERROR("Invalid number of procs");
 
-
     //-------------------------------------------------------------------------
     /** - Initialize FLUPS */
     //-------------------------------------------------------------------------
 
     // create a real topology
-    int FLUnmemIn[3],FLUnmemOUT[3];
-    Topology *topoIn      = new Topology(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
-    const int  nprocOut[3] = {1, 2, 1};
-    const int  nglobOut[3] = {17, 32, 64};
+    int                    FLUnmemIn[3], FLUnmemOUT[3];
+    Topology              *topoIn         = new Topology(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
+    const int              nprocOut[3]    = {1, 2, 1};
+    const int              nglobOut[3]    = {17, 32, 64};
     const FLUPS_CenterType center_type[3] = {CELL_CENTER, CELL_CENTER, CELL_CENTER};
-    
+
     // prepare profiling
 #ifdef SKIP_P3D
-    std::string FLUPSprof = "only_FLUPS_res" + std::to_string((int)(nglob[0]/L[0])) + "_nrank" + std::to_string(comm_size)+"_nthread" + std::to_string(omp_get_max_threads());
+    std::string FLUPSprof = "only_FLUPS_res" + std::to_string((int)(nglob[0] / L[0])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
 #else
-    std::string FLUPSprof = "compare_FLUPS_res" + std::to_string((int)(nglob[0]/L[0])) + "_nrank" + std::to_string(comm_size)+"_nthread" + std::to_string(omp_get_max_threads());
+    std::string FLUPSprof = "compare_FLUPS_res" + std::to_string((int)(nglob[0] / L[0])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
 #endif
-    Profiler* FLUprof = new Profiler(FLUPSprof);
+    Profiler *FLUprof = new Profiler(FLUPSprof);
 
     // solver creation and init
     Solver *mysolver = new Solver(topoIn, mybc, h, L, NOD, center_type, FLUprof);
@@ -166,13 +166,13 @@ int main(int argc, char *argv[]) {
     MPI_Comm_rank(comm, &rank);
 
     // retrieveing internal info from the solver
-    const Topology *topoSpec    = mysolver->get_innerTopo_spectral();
+    const Topology *topoSpec = mysolver->get_innerTopo_spectral();
 
     for (int i = 0; i < 3; i++) {
         FLUnmemIn[i]  = topoIn->nmem(i);
         FLUnmemOUT[i] = topoSpec->nmem(i);
     }
-    
+
     int istartGloOut[3], istartGlo[3];
     topoIn->get_istart_glob(istartGlo);
     topoSpec->get_istart_glob(istartGloOut);
@@ -185,143 +185,146 @@ int main(int argc, char *argv[]) {
     //-------------------------------------------------------------------------
 #ifndef SKIP_P3D
     int conf;
-    int dims[2] = {nproc[1],nproc[2]};
-    int  proc_order[3] = {0, 1, 2};
-    
+    int dims[2]       = {nproc[1], nproc[2]};
+    int proc_order[3] = {0, 1, 2};
+
     if (rank == 0)
         printf("Using processor grid %d x %d with pencils in x\n", dims[0], dims[1]);
 
-    std::string P3DFFTprof = "compare_P3DFFT_res" + std::to_string((int)(nglob[0]/L[0])) + "_nrank" + std::to_string(comm_size)+"_nthread" + std::to_string(omp_get_max_threads());
-    Profiler* P3Dprof = new Profiler(P3DFFTprof);
+    std::string P3DFFTprof = "compare_P3DFFT_res" + std::to_string((int)(nglob[0] / L[0])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
+    Profiler   *P3Dprof    = new Profiler(P3DFFTprof);
 
     P3Dprof->create("init", "root");
     P3Dprof->create("FFTandSwitch", "root");
 
     /* Initialize P3DFFT */
     P3Dprof->start("init");
-    if(rank==0)
-        printf("Initilizing P3D...\n");    fflush(stdout);
+    if (rank == 0)
+        printf("Initilizing P3D...\n");
+    fflush(stdout);
     p3dfft::setup();
-    if(rank==0)
-        printf("...set up.\n");    fflush(stdout);
+    if (rank == 0)
+        printf("...set up.\n");
+    fflush(stdout);
 
     // Define the transform types for these two transforms
-    int type_ids1[3] = {p3dfft::R2CFFT_D,p3dfft::CFFT_FORWARD_D ,p3dfft::CFFT_FORWARD_D };
-    int type_ids2[3] = {p3dfft::C2RFFT_D,p3dfft::CFFT_BACKWARD_D,p3dfft::CFFT_BACKWARD_D};
-    //Unfortunately, P3D segfaults when you do a DST2_REAL_D followed by a R2CFFT_D !!
-    p3dfft::trans_type3D type_rcc(type_ids1); 
+    int type_ids1[3] = {p3dfft::R2CFFT_D, p3dfft::CFFT_FORWARD_D, p3dfft::CFFT_FORWARD_D};
+    int type_ids2[3] = {p3dfft::C2RFFT_D, p3dfft::CFFT_BACKWARD_D, p3dfft::CFFT_BACKWARD_D};
+    // Unfortunately, P3D segfaults when you do a DST2_REAL_D followed by a R2CFFT_D !!
+    p3dfft::trans_type3D type_rcc(type_ids1);
     p3dfft::trans_type3D type_ccr(type_ids2);
 
     // input grid
-    int  gdimsIN[3]      = {topoIn->nglob(0), topoIn->nglob(1), topoIn->nglob(2)};
-    int  mem_orderIN[3]  = {0, 1, 2};
-    int  nprocIN[3]      = {nproc[0], nproc[1], nproc[2]};
-    p3dfft::grid gridIN(gdimsIN,-1,nprocIN,proc_order,mem_orderIN,comm); 
+    int          gdimsIN[3]     = {topoIn->nglob(0), topoIn->nglob(1), topoIn->nglob(2)};
+    int          mem_orderIN[3] = {0, 1, 2};
+    int          nprocIN[3]     = {nproc[0], nproc[1], nproc[2]};
+    p3dfft::grid gridIN(gdimsIN, -1, nprocIN, proc_order, mem_orderIN, comm);
 
-    if(rank==0)
-        printf("...input grid.\n");    fflush(stdout);
+    if (rank == 0)
+        printf("...input grid.\n");
+    fflush(stdout);
 
     // ouput grid
     int gdimsOUT[3];
-    for(int i=0; i < 3;i++) 
+    for (int i = 0; i < 3; i++)
         gdimsOUT[i] = gdimsIN[i];
-    gdimsOUT[0] = gdimsOUT[0]/2+1;
-    int mem_orderOUT[3]  = {2, 1, 0}; //blindly mimicking samples, anything else produces a segfault anyway...
-    int nprocOUT[3]      = {dims[0],dims[1],1};
-    p3dfft::grid gridOUT(gdimsOUT,0,nprocOUT,proc_order,mem_orderOUT,comm); 
+    gdimsOUT[0]                  = gdimsOUT[0] / 2 + 1;
+    int          mem_orderOUT[3] = {2, 1, 0};  // blindly mimicking samples, anything else produces a segfault anyway...
+    int          nprocOUT[3]     = {dims[0], dims[1], 1};
+    p3dfft::grid gridOUT(gdimsOUT, 0, nprocOUT, proc_order, mem_orderOUT, comm);
 
-    if(rank==0)
-        printf("...output grid.\n");    fflush(stdout);
+    if (rank == 0)
+        printf("...output grid.\n");
+    fflush(stdout);
 
     // Set up 3D transforms, including stages and plans, for forward trans.
-    p3dfft::transform3D<double,p3dfft::complex_double> trans_f(gridIN,gridOUT,&type_rcc);
+    p3dfft::transform3D<double, p3dfft::complex_double> trans_f(gridIN, gridOUT, &type_rcc);
     // Set up 3D transforms, including stages and plans, for backward trans.
-    p3dfft::transform3D<p3dfft::complex_double,double> trans_b(gridOUT,gridIN,&type_ccr);
-    if(rank==0)
-        printf("...transforms.\n");    fflush(stdout);
+    p3dfft::transform3D<p3dfft::complex_double, double> trans_b(gridOUT, gridIN, &type_ccr);
+    if (rank == 0)
+        printf("...transforms.\n");
+    fflush(stdout);
 
     P3Dprof->stop("init");
-    if(rank==0)
-        printf("Done with P3D init.\n");    fflush(stdout);
+    if (rank == 0)
+        printf("Done with P3D init.\n");
+    fflush(stdout);
 
     p3dfft::timers.init();
 
-
     // Find local dimensions in storage order, and also the starting position of the local array in the global array
-  
-    //input grid - SIZE IN DOUBLEs
-    int P3DnlocIN[3],glob_startIN[3];
-    for(int i=0;i<3;i++) {
-        P3DnlocIN[mem_orderIN[i]] = gridIN.ldims[i];
+
+    // input grid - SIZE IN DOUBLEs
+    int P3DnlocIN[3], glob_startIN[3];
+    for (int i = 0; i < 3; i++) {
+        P3DnlocIN[mem_orderIN[i]]    = gridIN.ldims[i];
         glob_startIN[mem_orderIN[i]] = gridIN.glob_start[i];
     }
-    int P3DmemsizeIN = P3DnlocIN[0]*P3DnlocIN[1]*P3DnlocIN[2];
+    int P3DmemsizeIN = P3DnlocIN[0] * P3DnlocIN[1] * P3DnlocIN[2];
 
-    //output grid - SIZE IN COMPLEXes
-    int P3DnlocOUT[3],glob_startOUT[3];
-    for(int i=0;i<3;i++) {
-        P3DnlocOUT[mem_orderOUT[i]] = gridOUT.ldims[i];
+    // output grid - SIZE IN COMPLEXes
+    int P3DnlocOUT[3], glob_startOUT[3];
+    for (int i = 0; i < 3; i++) {
+        P3DnlocOUT[mem_orderOUT[i]]    = gridOUT.ldims[i];
         glob_startOUT[mem_orderOUT[i]] = gridOUT.glob_start[i];
     }
-    int P3DmemsizeOUT = P3DnlocOUT[0]*P3DnlocOUT[1]*P3DnlocOUT[2];
+    int P3DmemsizeOUT = P3DnlocOUT[0] * P3DnlocOUT[1] * P3DnlocOUT[2];
 #endif
-    
 
     // //-------------------------------------------------------------------------
     // /** - allocate rhs and solution */
     // //-------------------------------------------------------------------------
-    if(rank == 0) {
-        printf("[FLUPS] topo IN glob : %d %d %d \n",topoIn->nglob(0),topoIn->nglob(1),topoIn->nglob(2));
-        printf("[FLUPS] topo IN loc : %d*%d*%d = %d (check: %d %d %d)\n",topoIn->nmem(0),topoIn->nmem(1),topoIn->nmem(2),topoIn->memsize(),topoIn->nloc(0),topoIn->nloc(1),topoIn->nloc(2));
-        printf("[FLUPS] topo OUT glob : %d %d %d \n",topoSpec->nglob(0),topoSpec->nglob(1),topoSpec->nglob(2));
-        printf("[FLUPS] topo OUT loc  : nmem: %d*%d*%d nf:%d (nloc: %d %d %d)  \n",topoSpec->nmem(0),topoSpec->nmem(1),topoSpec->nmem(2),topoSpec->nf(),topoSpec->nloc(0),topoSpec->nloc(1),topoSpec->nloc(2));
+    if (rank == 0) {
+        printf("[FLUPS] topo IN glob : %d %d %d \n", topoIn->nglob(0), topoIn->nglob(1), topoIn->nglob(2));
+        printf("[FLUPS] topo IN loc : %d*%d*%d = %d (check: %d %d %d)\n", topoIn->nmem(0), topoIn->nmem(1), topoIn->nmem(2), topoIn->memsize(), topoIn->nloc(0), topoIn->nloc(1), topoIn->nloc(2));
+        printf("[FLUPS] topo OUT glob : %d %d %d \n", topoSpec->nglob(0), topoSpec->nglob(1), topoSpec->nglob(2));
+        printf("[FLUPS] topo OUT loc  : nmem: %d*%d*%d nf:%d (nloc: %d %d %d)  \n", topoSpec->nmem(0), topoSpec->nmem(1), topoSpec->nmem(2), topoSpec->nf(), topoSpec->nloc(0), topoSpec->nloc(1), topoSpec->nloc(2));
 
 #ifndef SKIP_P3D
-        printf("[P3DFFT++] topo IN glob  : %d %d %d  \n",gdimsIN[0],gdimsIN[1],gdimsIN[2]);
-        printf("[P3DFFT++] topo IN loc   : %d %d %d (is: %d %d %d) \n",P3DnlocIN[0],P3DnlocIN[1],P3DnlocIN[2],glob_startIN[0],glob_startIN[1],glob_startIN[2]);
-        printf("[P3DFFT++] topo OUT glob : %d %d %d  \n",gdimsOUT[0],gdimsOUT[1],gdimsOUT[2]);
-        printf("[P3DFFT++] topo OUT loc  : %d %d %d (is: %d %d %d) \n",P3DnlocOUT[0],P3DnlocOUT[1],P3DnlocOUT[2],glob_startOUT[0],glob_startOUT[1],glob_startOUT[2]);
+        printf("[P3DFFT++] topo IN glob  : %d %d %d  \n", gdimsIN[0], gdimsIN[1], gdimsIN[2]);
+        printf("[P3DFFT++] topo IN loc   : %d %d %d (is: %d %d %d) \n", P3DnlocIN[0], P3DnlocIN[1], P3DnlocIN[2], glob_startIN[0], glob_startIN[1], glob_startIN[2]);
+        printf("[P3DFFT++] topo OUT glob : %d %d %d  \n", gdimsOUT[0], gdimsOUT[1], gdimsOUT[2]);
+        printf("[P3DFFT++] topo OUT loc  : %d %d %d (is: %d %d %d) \n", P3DnlocOUT[0], P3DnlocOUT[1], P3DnlocOUT[2], glob_startOUT[0], glob_startOUT[1], glob_startOUT[2]);
 #endif
 
-        printf("I am going to allocate FLUPS: %d (inside FLUPS: %d)\n",FLUmemsizeIN,FLUmemsizeOUT);
-#ifndef SKIP_P3D    
-        printf("                        P3D: %d (out %d C) \n",P3DmemsizeIN,P3DmemsizeOUT);
+        printf("I am going to allocate FLUPS: %d (inside FLUPS: %d)\n", FLUmemsizeIN, FLUmemsizeOUT);
+#ifndef SKIP_P3D
+        printf("                        P3D: %d (out %d C) \n", P3DmemsizeIN, P3DmemsizeOUT);
 #endif
     }
-    
- 
-    double *rhsFLU   = (double *)fftw_malloc(sizeof(double) * FLUmemsizeIN);
-    //solFLU   allocated by sflups setup with the larger size
-    std::memset(rhsFLU, 0, sizeof(double ) * FLUmemsizeIN);
-    std::memset(solFLU, 0, sizeof(double ) * FLUmemsizeOUT); 
-
-#ifndef SKIP_P3D 
-    double *rhsP3D   = (double *)fftw_malloc(sizeof(double) * P3DmemsizeIN);
-    p3dfft::complex_double *solP3D   = (p3dfft::complex_double *)fftw_malloc(sizeof(p3dfft::complex_double) * P3DmemsizeOUT);
-    std::memset(rhsP3D, 0, sizeof(double ) * P3DmemsizeIN);
+
+    double *rhsFLU = (double *)fftw_malloc(sizeof(double) * FLUmemsizeIN);
+    // solFLU   allocated by sflups setup with the larger size
+    std::memset(rhsFLU, 0, sizeof(double) * FLUmemsizeIN);
+    std::memset(solFLU, 0, sizeof(double) * FLUmemsizeOUT);
+
+#ifndef SKIP_P3D
+    double                 *rhsP3D = (double *)fftw_malloc(sizeof(double) * P3DmemsizeIN);
+    p3dfft::complex_double *solP3D = (p3dfft::complex_double *)fftw_malloc(sizeof(p3dfft::complex_double) * P3DmemsizeOUT);
+    std::memset(rhsP3D, 0, sizeof(double) * P3DmemsizeIN);
     std::memset(solP3D, 0, sizeof(p3dfft::complex_double) * P3DmemsizeOUT);
 #endif
-    
+
     // printf("istart: %d %d %d =? %d %d %d(P3D)\n",istartGlo[0],istartGlo[1],istartGlo[2],glob_startIN[0],glob_startIN[1],glob_startIN[2]);
 
-    double f = 1; //frequency of the wave
+    double f = 1;  // frequency of the wave
 
     for (int i2 = 0; i2 < topoIn->nloc(2); i2++) {
         for (int i1 = 0; i1 < topoIn->nloc(1); i1++) {
             for (int i0 = 0; i0 < topoIn->nloc(0); i0++) {
-                double       x    = (istartGlo[0] + i0 ) * h[0]; //+ 0.5
-                double       y    = (istartGlo[1] + i1 ) * h[1]; //+ 0.5
-                double       z    = (istartGlo[2] + i2 ) * h[2]; //+ 0.5
+                double x = (istartGlo[0] + i0) * h[0];  //+ 0.5
+                double y = (istartGlo[1] + i1) * h[1];  //+ 0.5
+                double z = (istartGlo[2] + i2) * h[2];  //+ 0.5
 
-                // double       xF   = (istartGlo[0] + i0 + .5 ) * h[0]; 
+                // double       xF   = (istartGlo[0] + i0 + .5 ) * h[0];
 
                 size_t id;
-                id    = localIndex(0, i0, i1, i2, 0, FLUnmemIn, 1, 0);
+                id         = localIndex(0, i0, i1, i2, 0, FLUnmemIn, 1, 0);
                 rhsFLU[id] = sin((c_2pi / L[0] * f) * x) * sin((c_2pi / L[1] * f) * y) * sin((c_2pi / L[2] * f) * z);
 
-#ifndef SKIP_P3D                 
-                //p3d does not care of the size you allocate, juste fill the first isize elements
-                id =  localIndex(0, i0, i1, i2, 0, P3DnlocIN, 1, 0);
+#ifndef SKIP_P3D
+                // p3d does not care of the size you allocate, juste fill the first isize elements
+                id         = localIndex(0, i0, i1, i2, 0, P3DnlocIN, 1, 0);
                 rhsP3D[id] = sin((c_2pi / L[0] * f) * x) * sin((c_2pi / L[1] * f) * y) * sin((c_2pi / L[2] * f) * z);
 #endif
             }
@@ -331,114 +334,112 @@ int main(int argc, char *argv[]) {
     // //-------------------------------------------------------------------------
     // /** - Proceed to the solve */
     // //-------------------------------------------------------------------------
-    
-       
-    double factor = 1.0/(nglob[0]* nglob[1]*nglob[2]);
-    //Warmup
-    // trans_f.exec(rhsP3D,solP3D,false);
-
-    for (int iter=0;iter<n_iter;iter++){
-        
+
+    double factor = 1.0 / (nglob[0] * nglob[1] * nglob[2]);
+    // Warmup
+    //  trans_f.exec(rhsP3D,solP3D,false);
+
+    for (int iter = 0; iter < n_iter; iter++) {
         MPI_Barrier(comm);
 
-#ifndef SKIP_P3D  
+#ifndef SKIP_P3D
         // ------------------P3D---------------:
-        if(rank==0){
-            printf("Iter %i P3D\n",iter);
+        if (rank == 0) {
+            printf("Iter %i P3D\n", iter);
         }
         fflush(stdout);
 
         P3Dprof->start("FFTandSwitch");
-        trans_f.exec(rhsP3D,solP3D,false);  // Execute forward real-to-complex FFT
+        trans_f.exec(rhsP3D, solP3D, false);  // Execute forward real-to-complex FFT
         P3Dprof->stop("FFTandSwitch");
 
 // #define PRINT_RES
 #ifdef PRINT_RES
         /* normalize */
-        for(int id = 0; id<P3DmemsizeOUT; id++){
+        for (int id = 0; id < P3DmemsizeOUT; id++) {
             solP3D[id] *= factor;
         }
-        print_res(solP3D,P3DnlocOUT,glob_startOUT);
+        print_res(solP3D, P3DnlocOUT, glob_startOUT);
 #endif
 #endif
 
         // ------------------FLUPS---------------:
         MPI_Barrier(comm);
-        if(rank==0){
-            printf("Iter %i FLUPS\n",iter);
+        if (rank == 0) {
+            printf("Iter %i FLUPS\n", iter);
         }
         fflush(stdout);
 
-        //reinit sol to prepare for inplace 3D FFT
-        std::memcpy(solFLU, rhsFLU, sizeof(double ) * FLUmemsizeIN);
+        // reinit sol to prepare for inplace 3D FFT
+        std::memcpy(solFLU, rhsFLU, sizeof(double) * FLUmemsizeIN);
 
         FLUprof->start("solve");
-        mysolver->do_FFT(solFLU,FLUPS_FORWARD);
+        mysolver->do_FFT(solFLU, FLUPS_FORWARD);
         FLUprof->stop("solve");
 
 #ifdef PRINT_RES
         /* normalize*/
-        for(int id = 0; id<FLUmemsizeOUT; id++){
+        for (int id = 0; id < FLUmemsizeOUT; id++) {
             solFLU[id] *= factor;
         }
-        print_res(solFLU,topoSpec);
+        print_res(solFLU, topoSpec);
 #endif
-        //Note: if we do several FFT in a raw, the results are wrong with FLUPS for iter>=1. This is because
-        // FLUPS would require to do the backward transform before doing a new forward transform (or a reset
-        // function, which we choose not to implement).
+        // Note: if we do several FFT in a raw, the results are wrong with FLUPS for iter>=1. This is because
+        //  FLUPS would require to do the backward transform before doing a new forward transform (or a reset
+        //  function, which we choose not to implement).
     }
 
     // //-------------------------------------------------------------------------
     // /** - get timings */
     // //-------------------------------------------------------------------------
-    
+
     // --- FLUPS -------
     FLUprof->disp("solve");
-    delete(FLUprof);
+    delete (FLUprof);
 
-#ifndef SKIP_P3D  
+#ifndef SKIP_P3D
     // --- P3DFFT -------
 #ifdef P3DMODIF
     double times[8][3];
-    p3dfft::timers.get(times,comm);
-    string P3DNames[8] = {"Reorder_trans","Reorder_out","Reorder_in","Trans_exec","Packsend","Packsend_trans","Unpackrecv","Alltoall"};
+    p3dfft::timers.get(times, comm);
+    string P3DNames[8] = {"Reorder_trans", "Reorder_out", "Reorder_in", "Trans_exec", "Packsend", "Packsend_trans", "Unpackrecv", "Alltoall"};
 #endif
 
-    double timeRef      = P3Dprof->get_timeAcc("FFTandSwitch");
-    double timeInit     = P3Dprof->get_timeAcc("init");
+    double timeRef  = P3Dprof->get_timeAcc("FFTandSwitch");
+    double timeInit = P3Dprof->get_timeAcc("init");
 
     P3Dprof->disp("FFTandSwitch");
-#ifdef P3DMODIF    
-    if(rank == 0) {
+#ifdef P3DMODIF
+    if (rank == 0) {
         // printf("===================================================================================================================================================\n");
         // printf("          TIMER P3DFFT %s\n",P3DFFTprof.c_str());
         // printf("%25s|  %-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\n","-NAME-    ", "-% global-", "-% local-", "-Total time-", "-Self time-", "-time/call-", "-Min time-", "-Max time-","-Mean cnt-","-(MB/s)-");
 
         string filename = "prof/" + P3DFFTprof + "_time.csv";
-        FILE* file      = fopen(filename.c_str(), "w+");
-    
-        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "init", 100*timeInit/timeRef, 100*timeInit/(timeRef+timeInit), timeInit, 0., timeInit/n_iter, 0., 0.,(double) n_iter,0.);
-        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "init", 100*timeInit/timeRef, 100*timeInit/(timeRef+timeInit), timeInit, 0., timeInit/n_iter, 0., 0.,(double) n_iter,0);
-        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "FFTandSwitch", 100., 100*timeRef/(timeRef+timeInit), timeRef, 0., timeRef/n_iter, 0., 0., (double) n_iter,0.);
-        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "FFTandSwitch", 100., 100*timeRef/(timeRef+timeInit), timeRef, 0., timeRef/n_iter, 0., 0., (double) n_iter,0.);
-        for(int i=0; i<8; i++){
+        FILE  *file     = fopen(filename.c_str(), "w+");
+
+        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "init", 100 * timeInit / timeRef, 100 * timeInit / (timeRef + timeInit), timeInit, 0., timeInit / n_iter, 0., 0., (double)n_iter, 0.);
+        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "init", 100 * timeInit / timeRef, 100 * timeInit / (timeRef + timeInit), timeInit, 0., timeInit / n_iter, 0., 0., (double)n_iter, 0);
+        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "FFTandSwitch", 100., 100 * timeRef / (timeRef + timeInit), timeRef, 0., timeRef / n_iter, 0., 0., (double)n_iter, 0.);
+        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "FFTandSwitch", 100., 100 * timeRef / (timeRef + timeInit), timeRef, 0., timeRef / n_iter, 0., 0., (double)n_iter, 0.);
+        for (int i = 0; i < 8; i++) {
             int j = i;
-            printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", ("-- "+P3DNames[j]).c_str(), 100.*times[j][0]/timeRef, 100*times[j][0]/timeRef, times[j][0], 0., times[j][0]/n_iter, times[j][1]/n_iter, times[j][2]/n_iter, (double) n_iter,0.);
-            fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", ("-- "+P3DNames[j]).c_str(), 100.*times[j][0]/timeRef, 100*times[j][0]/timeRef, times[j][0], 0., times[j][0]/n_iter, times[j][1]/n_iter, times[j][2]/n_iter, (double) n_iter,0.);
+            printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", ("-- " + P3DNames[j]).c_str(), 100. * times[j][0] / timeRef, 100 * times[j][0] / timeRef, times[j][0], 0., times[j][0] / n_iter, times[j][1] / n_iter, times[j][2] / n_iter, (double)n_iter, 0.);
+            fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", ("-- " + P3DNames[j]).c_str(), 100. * times[j][0] / timeRef, 100 * times[j][0] / timeRef, times[j][0], 0., times[j][0] / n_iter, times[j][1] / n_iter, times[j][2] / n_iter, (double)n_iter, 0.);
         }
         fclose(file);
-    }   
+    }
 #else
     p3dfft::timers.print(comm);
 #endif
-    delete(P3Dprof);
+    delete (P3Dprof);
 #endif
 
-    if(rank==0)
+    if (rank == 0)
         printf("Done ! Now let's clean...\n");
     fflush(stdout);
 
-#ifndef SKIP_P3D      
+#ifndef SKIP_P3D
     fftw_free(solP3D);
     fftw_free(rhsP3D);
 #endif
@@ -447,36 +448,36 @@ int main(int argc, char *argv[]) {
     topoIn->~Topology();
     topoSpec->~Topology();
     mysolver->~Solver();
-#ifndef SKIP_P3D      
+#ifndef SKIP_P3D
     p3dfft::cleanup();
 #endif
 
+    flups_cleanup_fftw();
     MPI_Finalize();
 }
 
-void print_res(double *A, const Topology* topo) {
-    const int ax0     = topo->axis();
-    const int ax1     = (ax0 + 1) % 3;
-    const int ax2     = (ax0 + 2) % 3;
-    const int nf = 2;//topo->nf()
-    int nmem[3];
-    
-    for(int i=0;i<3;i++){
+void print_res(double *A, const Topology *topo) {
+    const int ax0 = topo->axis();
+    const int ax1 = (ax0 + 1) % 3;
+    const int ax2 = (ax0 + 2) % 3;
+    const int nf  = 2;  // topo->nf()
+    int       nmem[3];
+
+    for (int i = 0; i < 3; i++) {
         nmem[i] = topo->nmem(i);
     }
 
     int gstart[3];
     topo->get_istart_glob(gstart);
 
-    if (topo->isComplex()){
+    if (topo->isComplex()) {
         for (int i2 = 0; i2 < topo->nloc(ax2); i2++) {
             for (int i1 = 0; i1 < topo->nloc(ax1); i1++) {
-                //local indexes start
+                // local indexes start
                 const size_t id = localIndex(ax0, 0, i1, i2, ax0, nmem, 2, 0);
                 for (int i0 = 0; i0 < topo->nloc(ax0); i0++) {
-                    
-                    if (std::fabs(A[id+i0*2]) + std::fabs(A[id+i0*2 + 1]) > 1.25e-4) {
-                        printf("FLU(%d %d %d) %lg +i1* %lg\n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id+i0*2], A[id+i0*2 + 1]);
+                    if (std::fabs(A[id + i0 * 2]) + std::fabs(A[id + i0 * 2 + 1]) > 1.25e-4) {
+                        printf("FLU(%d %d %d) %lg +i1* %lg\n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id + i0 * 2], A[id + i0 * 2 + 1]);
                     }
                 }
             }
@@ -484,12 +485,11 @@ void print_res(double *A, const Topology* topo) {
     } else {
         for (int i2 = 0; i2 < topo->nloc(ax2); i2++) {
             for (int i1 = 0; i1 < topo->nloc(ax1); i1++) {
-                //local indexes start
+                // local indexes start
                 const size_t id = localIndex(ax0, 0, i1, i2, ax0, nmem, 1, 0);
                 for (int i0 = 0; i0 < topo->nloc(ax0); i0++) {
-                    
-                    if (std::fabs(A[id+i0]) > 1.25e-4) {
-                        printf("FLU(%d %d %d) %lg \n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id+i0]);
+                    if (std::fabs(A[id + i0]) > 1.25e-4) {
+                        printf("FLU(%d %d %d) %lg \n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id + i0]);
                     }
                 }
             }
@@ -497,18 +497,17 @@ void print_res(double *A, const Topology* topo) {
     }
 }
 
-void print_res(p3dfft::complex_double *A,int *sdims,int *gstart)
-{
-  int x,y,z;
-  p3dfft::complex_double *p;
-  int imo[3],i,j;
-  p = A;
-
-  for(z=0;z < sdims[2];z++)
-    for(y=0;y < sdims[1];y++)
-      for(x=0;x < sdims[0];x++) {
-        if(std::fabs(p->real())+std::fabs(p->imag()) > 1.25e-4) 
-            printf("P3D(%d %d %d) %lg +i1* %lg\n",x+gstart[0],y+gstart[1],z+gstart[2],p->real(),p->imag());
-        p++;
-      }
-}
\ No newline at end of file
+void print_res(p3dfft::complex_double *A, int *sdims, int *gstart) {
+    int                     x, y, z;
+    p3dfft::complex_double *p;
+    int                     imo[3], i, j;
+    p = A;
+
+    for (z = 0; z < sdims[2]; z++)
+        for (y = 0; y < sdims[1]; y++)
+            for (x = 0; x < sdims[0]; x++) {
+                if (std::fabs(p->real()) + std::fabs(p->imag()) > 1.25e-4)
+                    printf("P3D(%d %d %d) %lg +i1* %lg\n", x + gstart[0], y + gstart[1], z + gstart[2], p->real(), p->imag());
+                p++;
+            }
+}
diff --git a/samples/compareP3DFFT/main_compare.cpp b/samples/compareP3DFFT/main_compare.cpp
index fd4e365..178c965 100644
--- a/samples/compareP3DFFT/main_compare.cpp
+++ b/samples/compareP3DFFT/main_compare.cpp
@@ -8,65 +8,62 @@
 #include "Solver.hpp"
 #include "p3dfft.h"
 
-void print_all_P3D(double *A,long int nar);
-void print_res(double *A, const Topology* topo);
+void print_all_P3D(double *A, long int nar);
+void print_res(double *A, const Topology *topo);
 
 int main(int argc, char *argv[]) {
-    
     //-------------------------------------------------------------------------
     // Initialize MPI
     //-------------------------------------------------------------------------
-    int rank, comm_size;
+    int      rank, comm_size;
     MPI_Comm comm = MPI_COMM_WORLD;
 
     int provided;
     // set MPI_THREAD_FUNNELED or MPI_THREAD_SERIALIZED
     int requested = MPI_THREAD_FUNNELED;
     MPI_Init_thread(&argc, &argv, requested, &provided);
-    if(provided < requested){
+    if (provided < requested) {
         FLUPS_ERROR("The MPI-provided thread behavior does not match");
     }
-   
+
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &comm_size);
 
     //-------------------------------------------------------------------------
-    //Definition of the problem
+    // Definition of the problem
     //-------------------------------------------------------------------------
-    const int     nglob[3] = {16, 16, 16};
-    const int     nproc[3] = {1, 1, 2}; //nproc[0] has to be 1 //CAUTION FOR THIS: nproc[1]<=nproc[2] !!!
-    const double  L[3]     = {1., 1., 1.};
+    const int    nglob[3] = {16, 16, 16};
+    const int    nproc[3] = {1, 1, 2};  // nproc[0] has to be 1 //CAUTION FOR THIS: nproc[1]<=nproc[2] !!!
+    const double L[3]     = {1., 1., 1.};
 
     const double h[3] = {L[0] / nglob[0], L[1] / nglob[1], L[2] / nglob[2]};
-    
+
     const FLUPS_CenterType center_type[3] = {CELL_CENTER, CELL_CENTER, CELL_CENTER};
 
-    FLUPS_BoundaryType* mybc[3][2];
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
-            mybc[id][is] = (FLUPS_BoundaryType*) flups_malloc(sizeof(int)*1);
+    FLUPS_BoundaryType *mybc[3][2];
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
+            mybc[id][is]    = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * 1);
             mybc[id][is][0] = PER;
         }
     }
 
-    unsigned char op_f[]="fft", op_b[]="tff";
+    unsigned char op_f[] = "fft", op_b[] = "tff";
 
     const int n_iter = 100;
 
-
-
     //-------------------------------------------------------------------------
     /** - Initialize FLUPS */
     //-------------------------------------------------------------------------
 
     // create a real topology
-    int FLUnmemIn[3],FLUnmemOUT[3];
-    FLUPS_Topology *topoIn = new FLUPS_Topology(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
-    const int  nprocOut[3] = {1, 2, 1};
-    const int  nglobOut[3] = {17, 32, 64};
-    
-    std::string FLUPSprof = "compare_FLUPS_res" + std::to_string((int)(nglob[0]/L[0])) + "_nrank" + std::to_string(comm_size)+"_nthread" + std::to_string(omp_get_max_threads());
-    Profiler* FLUprof = new Profiler(FLUPSprof);
+    int             FLUnmemIn[3], FLUnmemOUT[3];
+    FLUPS_Topology *topoIn      = new FLUPS_Topology(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
+    const int       nprocOut[3] = {1, 2, 1};
+    const int       nglobOut[3] = {17, 32, 64};
+
+    std::string FLUPSprof = "compare_FLUPS_res" + std::to_string((int)(nglob[0] / L[0])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
+    Profiler   *FLUprof   = new Profiler(FLUPSprof);
 
     FLUPS_Solver *mysolver = new FLUPS_Solver(topoIn, mybc, h, L, NOD, center_type, FLUprof);
 
@@ -77,13 +74,13 @@ int main(int argc, char *argv[]) {
     comm = flups_topo_get_comm(topoIn);
     MPI_Comm_rank(comm, &rank);
 
-    const Topology *topoSpec    = mysolver->get_innerTopo_spectral();
+    const Topology *topoSpec = mysolver->get_innerTopo_spectral();
 
     for (int i = 0; i < 3; i++) {
         FLUnmemIn[i]  = topoIn->nmem(i);
         FLUnmemOUT[i] = topoSpec->nmem(i);
     }
-    
+
     int istartGloOut[3], istartGlo[3];
     topoIn->get_istart_glob(istartGlo);
     topoSpec->get_istart_glob(istartGloOut);
@@ -96,17 +93,17 @@ int main(int argc, char *argv[]) {
     //-------------------------------------------------------------------------
 
     int conf;
-    int dims[2] = {nproc[1],nproc[2]};
+    int dims[2] = {nproc[1], nproc[2]};
     int P3Dmemsize[3];
-    int istart[3],isize[3],iend[3];
-    int fstart[3],fsize[3],fend[3];
-    int tstart[3],tsize[3],tend[3];
+    int istart[3], isize[3], iend[3];
+    int fstart[3], fsize[3], fend[3];
+    int tstart[3], tsize[3], tend[3];
 
     if (rank == 0)
         printf("Using processor grid %d x %d with pencils in x\n", dims[0], dims[1]);
 
-    std::string P3DFFTprof = "compare_P3DFFT_res" + std::to_string((int)(nglob[0]/L[0])) + "_nrank" + std::to_string(comm_size)+"_nthread" + std::to_string(omp_get_max_threads());
-    Profiler* P3Dprof = new Profiler(P3DFFTprof);
+    std::string P3DFFTprof = "compare_P3DFFT_res" + std::to_string((int)(nglob[0] / L[0])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
+    Profiler   *P3Dprof    = new Profiler(P3DFFTprof);
 
     P3Dprof->create("init", "root");
     P3Dprof->create("FFTandSwitch", "root");
@@ -115,76 +112,72 @@ int main(int argc, char *argv[]) {
     P3Dprof->start("init");
     Cp3dfft_setup(dims, nglob[0], nglob[1], nglob[2], MPI_Comm_c2f(comm), nglob[0], nglob[1], nglob[2], 1, P3Dmemsize);
     P3Dprof->stop("init");
-    
+
     /* Get dimensions for input array - real numbers, X-pencil shape.
       Note that we are following the Fortran ordering, i.e.
       the dimension  with stride-1 is X. */
-    //really? well x seems to be the last dimension...
+    // really? well x seems to be the last dimension...
     conf = 1;
     Cp3dfft_get_dims(istart, iend, isize, conf);
-    
+
     /* Get dimensions for output array - complex numbers, Z-pencil shape.
       Stride-1 dimension could be X or Z, depending on how the library
       was compiled (stride1 option) */
     conf = 2;
     Cp3dfft_get_dims(fstart, fend, fsize, conf);
-    
+
     /* Get what you should allocate in place, in double, per proc. Large enough for padding */
     conf = 3;
     Cp3dfft_get_dims(tstart, tend, tsize, conf);
 
-
-
     // //-------------------------------------------------------------------------
     // /** - allocate rhs and solution */
     // //-------------------------------------------------------------------------
-    printf("[FLUPS] topo IN glob : %d %d %d \n",topoIn->nglob(0),topoIn->nglob(1),topoIn->nglob(2));
-    printf("[FLUPS] topo IN loc : %d*%d*%d = %d (check: %d %d %d)\n",topoIn->nmem(0),topoIn->nmem(1),topoIn->nmem(2),topoIn->memsize(),topoIn->nloc(0),topoIn->nloc(1),topoIn->nloc(2));
-    printf("[FLUPS] topo OUT glob : %d %d %d \n",topoSpec->nglob(0),topoSpec->nglob(1),topoSpec->nglob(2));
-    printf("[FLUPS] topo OUT loc  : nmem: %d*%d*%d nf:%d (nloc: %d %d %d)  \n",topoSpec->nmem(0),topoSpec->nmem(1),topoSpec->nmem(2),topoSpec->nf(),topoSpec->nloc(0),topoSpec->nloc(1),topoSpec->nloc(2));
-
-    printf("[P3DFFT] topo 0 loc : %d %d %d  - (is: %d %d %d ie: %d %d %d size: %d %d %d) \n",P3Dmemsize[0],P3Dmemsize[1],P3Dmemsize[2],istart[0],istart[1],istart[2], iend[0],iend[1],iend[2], isize[0],isize[1],isize[2]);
-    printf("[P3DFFT] topo 2 loc : %d %d %d  - (is: %d %d %d ie: %d %d %d) \n",fsize[0],fsize[1],fsize[2], fstart[0],fstart[1],fstart[2], fend[0],fend[1],fend[2]);
-    printf("[P3DFFT] for alloc: %d %d %d  =? %d %d %d \n",P3Dmemsize[0],P3Dmemsize[1],P3Dmemsize[2], tsize[0],tsize[1],tsize[2]);
+    printf("[FLUPS] topo IN glob : %d %d %d \n", topoIn->nglob(0), topoIn->nglob(1), topoIn->nglob(2));
+    printf("[FLUPS] topo IN loc : %d*%d*%d = %d (check: %d %d %d)\n", topoIn->nmem(0), topoIn->nmem(1), topoIn->nmem(2), topoIn->memsize(), topoIn->nloc(0), topoIn->nloc(1), topoIn->nloc(2));
+    printf("[FLUPS] topo OUT glob : %d %d %d \n", topoSpec->nglob(0), topoSpec->nglob(1), topoSpec->nglob(2));
+    printf("[FLUPS] topo OUT loc  : nmem: %d*%d*%d nf:%d (nloc: %d %d %d)  \n", topoSpec->nmem(0), topoSpec->nmem(1), topoSpec->nmem(2), topoSpec->nf(), topoSpec->nloc(0), topoSpec->nloc(1), topoSpec->nloc(2));
+
+    printf("[P3DFFT] topo 0 loc : %d %d %d  - (is: %d %d %d ie: %d %d %d size: %d %d %d) \n", P3Dmemsize[0], P3Dmemsize[1], P3Dmemsize[2], istart[0], istart[1], istart[2], iend[0], iend[1], iend[2], isize[0], isize[1], isize[2]);
+    printf("[P3DFFT] topo 2 loc : %d %d %d  - (is: %d %d %d ie: %d %d %d) \n", fsize[0], fsize[1], fsize[2], fstart[0], fstart[1], fstart[2], fend[0], fend[1], fend[2]);
+    printf("[P3DFFT] for alloc: %d %d %d  =? %d %d %d \n", P3Dmemsize[0], P3Dmemsize[1], P3Dmemsize[2], tsize[0], tsize[1], tsize[2]);
     // => P3Dmemsize = tsize
 
-
     // int nm = isize[0]*isize[1]*isize[2];
     // printf("isize: %d %d %d - fsize: %d %d %d\n", isize[0],isize[1],isize[2], fsize[0],fsize[1],fsize[2]);
     // if(nm < fsize[0]*fsize[1]*fsize[2]*2){ //switch to complex?
     //     nm = fsize[0]*fsize[1]*fsize[2]*2;
     // }
-    int nm = tsize[0]*tsize[1]*tsize[2]; //this is in double
-    printf("I am going to allocate FLUPS: %d (out %d R) , P3D: %d (out %d C) \n",FLUmemsizeIN,FLUmemsizeOUT,nm, tsize[0]*tsize[1]*tsize[2]);
-    
- 
-    double *rhsFLU   = (double *)fftw_malloc(sizeof(double) * FLUmemsizeIN);
+    int nm = tsize[0] * tsize[1] * tsize[2];  // this is in double
+    printf("I am going to allocate FLUPS: %d (out %d R) , P3D: %d (out %d C) \n", FLUmemsizeIN, FLUmemsizeOUT, nm, tsize[0] * tsize[1] * tsize[2]);
+
+    double *rhsFLU = (double *)fftw_malloc(sizeof(double) * FLUmemsizeIN);
     // solFLU   allocated by setup
-    double *rhsP3D   = (double *)fftw_malloc(sizeof(double) * nm);
-    double *solP3D   = (double *)fftw_malloc(sizeof(double) * nm);
+    double *rhsP3D = (double *)fftw_malloc(sizeof(double) * nm);
+    double *solP3D = (double *)fftw_malloc(sizeof(double) * nm);
 
-    std::memset(rhsFLU, 0, sizeof(double ) * FLUmemsizeIN);
-    std::memset(solFLU, 0, sizeof(double ) * FLUmemsizeOUT); 
-    std::memset(rhsP3D, 0, sizeof(double ) * nm);
-    std::memset(solP3D, 0, sizeof(double ) * nm);
-    
-    printf("istart: %d %d %d =? %d %d %d(P3D)\n",istartGlo[0],istartGlo[1],istartGlo[2],istart[0],istart[1],istart[2]);
+    std::memset(rhsFLU, 0, sizeof(double) * FLUmemsizeIN);
+    std::memset(solFLU, 0, sizeof(double) * FLUmemsizeOUT);
+    std::memset(rhsP3D, 0, sizeof(double) * nm);
+    std::memset(solP3D, 0, sizeof(double) * nm);
+
+    printf("istart: %d %d %d =? %d %d %d(P3D)\n", istartGlo[0], istartGlo[1], istartGlo[2], istart[0], istart[1], istart[2]);
 
     double f = 1;
 
     for (int i2 = 0; i2 < topoIn->nloc(2); i2++) {
         for (int i1 = 0; i1 < topoIn->nloc(1); i1++) {
             for (int i0 = 0; i0 < topoIn->nloc(0); i0++) {
-                double       x    = (istartGlo[0] + i0 ) * h[0]; //+ 0.5
-                double       y    = (istartGlo[1] + i1 ) * h[1]; //+ 0.5
-                double       z    = (istartGlo[2] + i2 ) * h[2]; //+ 0.5
+                double x = (istartGlo[0] + i0) * h[0];  //+ 0.5
+                double y = (istartGlo[1] + i1) * h[1];  //+ 0.5
+                double z = (istartGlo[2] + i2) * h[2];  //+ 0.5
 
                 size_t id;
                 id         = localIndex(0, i0, i1, i2, 0, FLUnmemIn, 1, 0);
                 rhsFLU[id] = sin((c_2pi / L[0] * f) * x) * sin((c_2pi / L[1] * f) * y) * sin((c_2pi / L[2] * f) * z);
-                
-                //p3d does not care of the size you allocate, juste fill the first isize elements
-                id =  localIndex(0, i0, i1, i2, 0, isize, 1, 0);
+
+                // p3d does not care of the size you allocate, juste fill the first isize elements
+                id         = localIndex(0, i0, i1, i2, 0, isize, 1, 0);
                 rhsP3D[id] = sin((c_2pi / L[0] * f) * x) * sin((c_2pi / L[1] * f) * y) * sin((c_2pi / L[2] * f) * z);
             }
         }
@@ -193,72 +186,70 @@ int main(int argc, char *argv[]) {
     // //-------------------------------------------------------------------------
     // /** - Proceed to the solve */
     // //-------------------------------------------------------------------------
-    
-    int Ntot = fsize[0]*fsize[1];
-    Ntot *= fsize[2]*2; //the number of real corresponding to the complex numbers
-    
-    double factor = 1.0/(nglob[0]* nglob[1]*nglob[2]);
-
-    for (int iter=0;iter<n_iter;iter++){
-        
+
+    int Ntot = fsize[0] * fsize[1];
+    Ntot *= fsize[2] * 2;  // the number of real corresponding to the complex numbers
+
+    double factor = 1.0 / (nglob[0] * nglob[1] * nglob[2]);
+
+    for (int iter = 0; iter < n_iter; iter++) {
         MPI_Barrier(comm);
-        if(rank==0){
-            printf("Iter %i\n",iter);
+        if (rank == 0) {
+            printf("Iter %i\n", iter);
         }
         fflush(stdout);
 
         // ------------------P3D---------------:
 
         P3Dprof->start("FFTandSwitch");
-        Cp3dfft_ftran_r2c(rhsP3D,solP3D,op_f);
+        Cp3dfft_ftran_r2c(rhsP3D, solP3D, op_f);
         P3Dprof->stop("FFTandSwitch");
 
-//#define PRINT_RES
+// #define PRINT_RES
 #ifdef PRINT_RES
         /* normallize */
-        for(int id = 0; id<Ntot; id++){
+        for (int id = 0; id < Ntot; id++) {
             solP3D[id] *= factor;
-        }   
-        print_all_P3D(solP3D,Ntot);
+        }
+        print_all_P3D(solP3D, Ntot);
 #endif
 
         // ------------------FLUPS---------------:
         MPI_Barrier(comm);
 
-        std::memcpy(solFLU, rhsFLU, sizeof(double ) * FLUmemsizeIN);
-        mysolver->do_FFT(solFLU,FLUPS_FORWARD);
+        std::memcpy(solFLU, rhsFLU, sizeof(double) * FLUmemsizeIN);
+        mysolver->do_FFT(solFLU, FLUPS_FORWARD);
 
 #ifdef PRINT_RES
         /* normalize*/
-        for(int id = 0; id<FLUmemsizeOUT; id++){
+        for (int id = 0; id < FLUmemsizeOUT; id++) {
             solFLU[id] *= factor;
         }
-        print_res(solFLU,topoSpec);
+        print_res(solFLU, topoSpec);
 #endif
-        //Note: if we do several FFT in a raw, the results are wrong with FLUPS for iter>=1. This is because
-        // FLUPS would require to do the backward transform before doing a new forward transform (or a reset
-        // function, which we choose not to implement).
-
+        // Note: if we do several FFT in a raw, the results are wrong with FLUPS for iter>=1. This is because
+        //  FLUPS would require to do the backward transform before doing a new forward transform (or a reset
+        //  function, which we choose not to implement).
     }
 
     // //-------------------------------------------------------------------------
     // /** - get timings */
     // //-------------------------------------------------------------------------
-    
+
     // --- FLUPS -------
     FLUprof->disp("solve");
-    delete(FLUprof);
+    delete (FLUprof);
 
     // --- P3DFFT -------
     double timers[12], gtmean[12], gtmax[12], gtmin[12];
 
     get_timers(timers);
-    MPI_Reduce(&timers,&gtmean,12,MPI_DOUBLE,MPI_SUM,0,comm);
-    MPI_Reduce(&timers,&gtmax ,12,MPI_DOUBLE,MPI_MAX,0,comm);
-    MPI_Reduce(&timers,&gtmin ,12,MPI_DOUBLE,MPI_MIN,0,comm);
+    MPI_Reduce(&timers, &gtmean, 12, MPI_DOUBLE, MPI_SUM, 0, comm);
+    MPI_Reduce(&timers, &gtmax, 12, MPI_DOUBLE, MPI_MAX, 0, comm);
+    MPI_Reduce(&timers, &gtmin, 12, MPI_DOUBLE, MPI_MIN, 0, comm);
 
-    for (int i=0;i < 12;i++) {
-        gtmean[i] = gtmean[i]/ ((double) comm_size);
+    for (int i = 0; i < 12; i++) {
+        gtmean[i] = gtmean[i] / ((double)comm_size);
     }
 
     double timeRef      = P3Dprof->get_timeAcc("FFTandSwitch");
@@ -266,38 +257,37 @@ int main(int argc, char *argv[]) {
     string P3DNames[12] = {"All2All(v) 1", "All2All(v) 2", "?", "?", "fft 1", "reorder 1", "fft 2", "reorder+fft 3", "?", "?", "?", "?"};
     int    order[12]    = {5, 1, 6, 7, 2, 8, 3, 4, 9, 10, 11, 12};
 
-
     P3Dprof->disp("FFTandSwitch");
-    if(rank == 0) {
+    if (rank == 0) {
         // printf("===================================================================================================================================================\n");
         // printf("          TIMER P3DFFT %s\n",P3DFFTprof.c_str());
         // printf("%25s|  %-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\t%-13s\n","-NAME-    ", "-% global-", "-% local-", "-Total time-", "-Self time-", "-time/call-", "-Min time-", "-Max time-","-Mean cnt-","-(MB/s)-");
 
         string filename = "prof/" + P3DFFTprof + "_time.csv";
-        FILE* file      = fopen(filename.c_str(), "w+");
-    
-        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "init", 100*timeInit/timeRef, 100*timeInit/(timeRef+timeInit), timeInit, 0., timeInit/n_iter, 0., 0.,(double) n_iter,0.);
-        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "init", 100*timeInit/timeRef, 100*timeInit/(timeRef+timeInit), timeInit, 0., timeInit/n_iter, 0., 0.,(double) n_iter,0);
-        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "FFTandSwitch", 100., 100*timeRef/(timeRef+timeInit), timeRef, 0., timeRef/n_iter, 0., 0., (double) n_iter,0.);
-        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "FFTandSwitch", 100., 100*timeRef/(timeRef+timeInit), timeRef, 0., timeRef/n_iter, 0., 0., (double) n_iter,0.);
-        for(int i=0; i<6; i++){
-            int j = order[i]-1;
-            printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", ("-- "+P3DNames[j]).c_str(), 100.*gtmean[j]/timeRef, 100*gtmean[j]/timeRef, gtmean[j], 0., gtmean[j]/n_iter, gtmin[j]/n_iter, gtmax[j]/n_iter, (double) n_iter,0.);
-            fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", ("-- "+P3DNames[j]).c_str(), 100.*gtmean[j]/timeRef, 100*gtmean[j]/timeRef, gtmean[j], 0., gtmean[j]/n_iter, gtmin[j]/n_iter, gtmax[j]/n_iter, (double) n_iter,0.);
+        FILE  *file     = fopen(filename.c_str(), "w+");
+
+        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "init", 100 * timeInit / timeRef, 100 * timeInit / (timeRef + timeInit), timeInit, 0., timeInit / n_iter, 0., 0., (double)n_iter, 0.);
+        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "init", 100 * timeInit / timeRef, 100 * timeInit / (timeRef + timeInit), timeInit, 0., timeInit / n_iter, 0., 0., (double)n_iter, 0);
+        printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", "FFTandSwitch", 100., 100 * timeRef / (timeRef + timeInit), timeRef, 0., timeRef / n_iter, 0., 0., (double)n_iter, 0.);
+        fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", "FFTandSwitch", 100., 100 * timeRef / (timeRef + timeInit), timeRef, 0., timeRef / n_iter, 0., 0., (double)n_iter, 0.);
+        for (int i = 0; i < 6; i++) {
+            int j = order[i] - 1;
+            printf("%-25.25s|  %9.4f\t%9.4f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%9.6f\t%09.1f\t%9.2f\n", ("-- " + P3DNames[j]).c_str(), 100. * gtmean[j] / timeRef, 100 * gtmean[j] / timeRef, gtmean[j], 0., gtmean[j] / n_iter, gtmin[j] / n_iter, gtmax[j] / n_iter, (double)n_iter, 0.);
+            fprintf(file, "%s;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.6f;%09.0f;%09.2f\n", ("-- " + P3DNames[j]).c_str(), 100. * gtmean[j] / timeRef, 100 * gtmean[j] / timeRef, gtmean[j], 0., gtmean[j] / n_iter, gtmin[j] / n_iter, gtmax[j] / n_iter, (double)n_iter, 0.);
         }
         fclose(file);
 
         // -- backup of all timers --
         filename = "prof/" + P3DFFTprof + "_backup.csv";
-        file            = fopen(filename.c_str(), "w+");
-        for(int i=0;i < 12;i++) {
-            fprintf(file,"timer[%d] (avg/max/min): %lE %lE %lE\n",i+1,gtmean[i],gtmax[i],gtmin[i]);
+        file     = fopen(filename.c_str(), "w+");
+        for (int i = 0; i < 12; i++) {
+            fprintf(file, "timer[%d] (avg/max/min): %lE %lE %lE\n", i + 1, gtmean[i], gtmax[i], gtmin[i]);
         }
         fclose(file);
-    }   
-    delete(P3Dprof);
+    }
+    delete (P3Dprof);
 
-    if(rank==0)
+    if (rank == 0)
         printf("Done ! Now let's clean...\n");
     fflush(stdout);
 
@@ -307,65 +297,62 @@ int main(int argc, char *argv[]) {
 
     topoIn->~Topology();
     topoSpec->~Topology();
-    
+
     mysolver->~Solver();
-    
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
-            mybc[id][is] = (FLUPS_BoundaryType*) flups_malloc(sizeof(int)*1);
+
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
+            mybc[id][is] = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * 1);
             flups_free(mybc[id][is]);
         }
     }
     Cp3dfft_clean();
+    flups_cleanup_fftw();
 
     MPI_Finalize();
 }
 
+void print_all_P3D(double *A, long int nar) {
+    int      x, y, z, conf, Fstart[3], Fsize[3], Fend[3];
+    long int i;
 
-
-void print_all_P3D(double *A,long int nar)
-{
-  int x,y,z,conf,Fstart[3],Fsize[3],Fend[3];
-  long int i;
-
-  conf = 2;
-  Cp3dfft_get_dims(Fstart,Fend,Fsize,conf);
-  /*
-  Fsize[0] *= 2;
-  Fstart[0] = (Fstart[0]-1)*2;
-  */
-  for(i=0;i < nar;i+=2)
-    if(fabs(A[i]) + fabs(A[i+1]) > 1.25e-4) {
-      z = i/(2*Fsize[0]*Fsize[1]);
-      y = i/(2*Fsize[0]) - z*Fsize[1];
-      x = i/2-z*Fsize[0]*Fsize[1] - y*Fsize[0];
-      printf("P3D(%d,%d,%d) %.16lg %.16lg\n",x+Fstart[0],y+Fstart[1],z+Fstart[2],A[i],A[i+1]);
-    }
+    conf = 2;
+    Cp3dfft_get_dims(Fstart, Fend, Fsize, conf);
+    /*
+    Fsize[0] *= 2;
+    Fstart[0] = (Fstart[0]-1)*2;
+    */
+    for (i = 0; i < nar; i += 2)
+        if (fabs(A[i]) + fabs(A[i + 1]) > 1.25e-4) {
+            z = i / (2 * Fsize[0] * Fsize[1]);
+            y = i / (2 * Fsize[0]) - z * Fsize[1];
+            x = i / 2 - z * Fsize[0] * Fsize[1] - y * Fsize[0];
+            printf("P3D(%d,%d,%d) %.16lg %.16lg\n", x + Fstart[0], y + Fstart[1], z + Fstart[2], A[i], A[i + 1]);
+        }
 }
 
-void print_res(double *A, const Topology* topo) {
-    const int ax0     = topo->axis();
-    const int ax1     = (ax0 + 1) % 3;
-    const int ax2     = (ax0 + 2) % 3;
-    const int nf = 2;//topo->nf()
-    int nmem[3];
-    
-    for(int i=0;i<3;i++){
+void print_res(double *A, const Topology *topo) {
+    const int ax0 = topo->axis();
+    const int ax1 = (ax0 + 1) % 3;
+    const int ax2 = (ax0 + 2) % 3;
+    const int nf  = 2;  // topo->nf()
+    int       nmem[3];
+
+    for (int i = 0; i < 3; i++) {
         nmem[i] = topo->nmem(i);
     }
 
     int gstart[3];
     topo->get_istart_glob(gstart);
 
-    if (topo->isComplex()){
+    if (topo->isComplex()) {
         for (int i2 = 0; i2 < topo->nloc(ax2); i2++) {
             for (int i1 = 0; i1 < topo->nloc(ax1); i1++) {
-                //local indexes start
+                // local indexes start
                 const size_t id = localIndex(ax0, 0, i1, i2, ax0, nmem, 2, 0);
                 for (int i0 = 0; i0 < topo->nloc(ax0); i0++) {
-                    
-                    if (std::fabs(A[id+i0*2]) + std::fabs(A[id+i0*2 + 1]) > 1.25e-4) {
-                        printf("FLU(%d %d %d) %lg +i1* %lg\n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id+i0*2], A[id+i0*2 + 1]);
+                    if (std::fabs(A[id + i0 * 2]) + std::fabs(A[id + i0 * 2 + 1]) > 1.25e-4) {
+                        printf("FLU(%d %d %d) %lg +i1* %lg\n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id + i0 * 2], A[id + i0 * 2 + 1]);
                     }
                 }
             }
@@ -373,15 +360,14 @@ void print_res(double *A, const Topology* topo) {
     } else {
         for (int i2 = 0; i2 < topo->nloc(ax2); i2++) {
             for (int i1 = 0; i1 < topo->nloc(ax1); i1++) {
-                //local indexes start
+                // local indexes start
                 const size_t id = localIndex(ax0, 0, i1, i2, ax0, nmem, 1, 0);
                 for (int i0 = 0; i0 < topo->nloc(ax0); i0++) {
-                    
-                    if (std::fabs(A[id+i0]) > 1.25e-4) {
-                        printf("FLU(%d %d %d) %lg \n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id+i0]);
+                    if (std::fabs(A[id + i0]) > 1.25e-4) {
+                        printf("FLU(%d %d %d) %lg \n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id + i0]);
                     }
                 }
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/samples/simpleTest/main_simple_test.cpp b/samples/simpleTest/main_simple_test.cpp
index 70a2788..eb6235d 100644
--- a/samples/simpleTest/main_simple_test.cpp
+++ b/samples/simpleTest/main_simple_test.cpp
@@ -8,19 +8,18 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "mpi.h"
-#include "h3lpr/profiler.hpp"
-#include "h3lpr/macros.hpp"
 #include "flups.h"
+#include "h3lpr/macros.hpp"
+#include "h3lpr/profiler.hpp"
+#include "mpi.h"
 
 #define m_log(format, ...) m_log_def("test", format, ##__VA_ARGS__)
 
-void print_res(double *A, const FLUPS_Topology* topo, std::string filename);
+void print_res(double *A, const FLUPS_Topology *topo, std::string filename);
 
-int main(int argc, char *argv[])  {    
-    
+int main(int argc, char *argv[]) {
     MPI_Init(&argc, &argv);
-    int rank, comm_size;
+    int      rank, comm_size;
     MPI_Comm comm = MPI_COMM_WORLD;
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &comm_size);
@@ -29,14 +28,14 @@ int main(int argc, char *argv[])  {
     //-------------------------------------------------------------------------
     // Definition of the problem
     //-------------------------------------------------------------------------
-    bool is_node = true;
-    const int nglob[3] = {9, 9, 9};
-    const int nproc[3] = {1, 1, comm_size};
-    const double L[3] = {1., 1., 1.};
-    const double h[3] = {L[0] / (nglob[0] - is_node), L[1] / (nglob[1] - is_node), L[2] / (nglob[2] - is_node)};  
-    int nsolve = 1;
-
-    FLUPS_CenterType center_type[3]; 
+    bool         is_node  = true;
+    const int    nglob[3] = {9, 9, 9};
+    const int    nproc[3] = {1, 1, comm_size};
+    const double L[3]     = {1., 1., 1.};
+    const double h[3]     = {L[0] / (nglob[0] - is_node), L[1] / (nglob[1] - is_node), L[2] / (nglob[2] - is_node)};
+    int          nsolve   = 1;
+
+    FLUPS_CenterType    center_type[3];
     FLUPS_BoundaryType *mybc[3][2];
     for (int id = 0; id < 3; id++) {
         center_type[id] = is_node ? NODE_CENTER : CELL_CENTER;
@@ -50,19 +49,19 @@ int main(int argc, char *argv[])  {
     /** - Initialize FLUPS */
     //-------------------------------------------------------------------------
     // create a real topology
-    FLUPS_Topology *topo      = flups_topo_new(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
+    FLUPS_Topology *topo = flups_topo_new(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
 
-    // Profiler creation 
-    FLUPS_Profiler* prof = flups_profiler_new();
+    // Profiler creation
+    FLUPS_Profiler *prof = flups_profiler_new();
 
     // Solver creation and init
     FLUPS_Solver *mysolver = flups_init_timed(topo, mybc, h, L, NOD, center_type, prof);
-    flups_set_greenType(mysolver,CHAT_2);
+    flups_set_greenType(mysolver, CHAT_2);
     flups_setup(mysolver, true);
 
     // recompute the communicator and the rank
     comm = flups_topo_get_comm(topo);
-    MPI_Comm_rank(comm,&rank);
+    MPI_Comm_rank(comm, &rank);
 
     //-------------------------------------------------------------------------
     /** - allocate rhs and solution */
@@ -80,17 +79,17 @@ int main(int argc, char *argv[])  {
     const int ax1     = (ax0 + 1) % 3;
     const int ax2     = (ax0 + 2) % 3;
 
-    const double k = 2.0 * M_PI ;
-    auto fun = [k](double pos[3]) -> double { return cos(k * pos[0]); };
-    auto d2fundx = [k](double pos[3]) -> double { return (-(k*k)*cos(k * pos[0])); };
-
-    const double shift = is_node? 0.0 : 0.5;
-    for (int i2 = 0; i2 < flups_topo_get_nloc(topo,ax2) ; i2++) {
-        for (int i1 = 0; i1 < flups_topo_get_nloc(topo,ax1) ; i1++) {
-            for (int i0 = 0; i0 < flups_topo_get_nloc(topo,ax0); i0++) {
-                double pos[3] ={(istart[ax0] + i0 + shift) * h[ax0], (istart[ax1] + i1 + shift) * h[ax1], (istart[ax2] + i2 + shift) * h[ax2]};
-                const size_t id    = flups_locID(0, i0, i1, i2, 0, 0, nmem, 1);
-                rhs[id] = d2fundx(pos);
+    const double k       = 2.0 * M_PI;
+    auto         fun     = [k](double pos[3]) -> double { return cos(k * pos[0]); };
+    auto         d2fundx = [k](double pos[3]) -> double { return (-(k * k) * cos(k * pos[0])); };
+
+    const double shift = is_node ? 0.0 : 0.5;
+    for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2); i2++) {
+        for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1); i1++) {
+            for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0); i0++) {
+                double       pos[3] = {(istart[ax0] + i0 + shift) * h[ax0], (istart[ax1] + i1 + shift) * h[ax1], (istart[ax2] + i2 + shift) * h[ax2]};
+                const size_t id     = flups_locID(0, i0, i1, i2, 0, 0, nmem, 1);
+                rhs[id]             = d2fundx(pos);
                 // rhs[id] = (istart[ax0] + i0) + nmem[0]*((istart[ax1] + i1) + nglob[1]*(i2 + istart[ax2])); //d2fundx(pos);
             }
         }
@@ -104,48 +103,47 @@ int main(int argc, char *argv[])  {
         flups_solve(mysolver, field, rhs, STD);
     }
 
-
     //-------------------------------------------------------------------------
     /** - Compute the error */
     //-------------------------------------------------------------------------
     double err = std::numeric_limits<double>::lowest();
-    bool is_last_topo[3]; 
-    for(int id = 0; id < 3; id++){
-        is_last_topo[id] = (istart[id] + flups_topo_get_nloc(topo,(flups_topo_get_axis(topo) + id)%3) == nglob[id]);
+    bool   is_last_topo[3];
+    for (int id = 0; id < 3; id++) {
+        is_last_topo[id] = (istart[id] + flups_topo_get_nloc(topo, (flups_topo_get_axis(topo) + id) % 3) == nglob[id]);
     }
 
-    if((0 == flups_topo_get_nloc(topo,ax2) - is_last_topo[2]) || (0 == flups_topo_get_nloc(topo,ax1) - is_last_topo[1]) || (0 == flups_topo_get_nloc(topo,ax1) - is_last_topo[0])){
+    if ((0 == flups_topo_get_nloc(topo, ax2) - is_last_topo[2]) || (0 == flups_topo_get_nloc(topo, ax1) - is_last_topo[1]) || (0 == flups_topo_get_nloc(topo, ax1) - is_last_topo[0])) {
         printf("[test - %d] You don't have enough points to fill in all the procs - my error (rank %d) is set to 0 by default \n ", rank, rank);
         err = 0.0;
     }
 
-    for (int i2 = 0; i2 < flups_topo_get_nloc(topo,ax2) - is_last_topo[2] ; i2++) {
-        for (int i1 = 0; i1 < flups_topo_get_nloc(topo,ax1) - is_last_topo[1] ; i1++) {
-            for (int i0 = 0; i0 < flups_topo_get_nloc(topo,ax0) - is_last_topo[0]; i0++) {
-                double pos[3] ={(istart[ax0] + i0 + shift) * h[ax0], (istart[ax1] + i1 + shift) * h[ax1], (istart[ax2] + i2 + shift) * h[ax2]};
-                const size_t id    = flups_locID(0, i0, i1, i2, 0, 0, nmem, 1);
+    for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2) - is_last_topo[2]; i2++) {
+        for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1) - is_last_topo[1]; i1++) {
+            for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0) - is_last_topo[0]; i0++) {
+                double       pos[3] = {(istart[ax0] + i0 + shift) * h[ax0], (istart[ax1] + i1 + shift) * h[ax1], (istart[ax2] + i2 + shift) * h[ax2]};
+                const size_t id     = flups_locID(0, i0, i1, i2, 0, 0, nmem, 1);
                 // Gaussian
                 err = std::max(err, abs(field[id] - fun(pos)));
             }
         }
     }
 
-    //m_log("Printing results");
-    //printf("[test %d] Error computation - end %d %d %d ", rank,flups_topo_get_nloc(topo,ax0) - is_last_topo[0], flups_topo_get_nloc(topo,ax1) - is_last_topo[1], flups_topo_get_nloc(topo,ax2) - is_last_topo[2] );
-    //std::string arg_name = (argc == 2) ? argv[1] : "default";
-    //print_res(field, topo, "data/sol_" + arg_name) ;
+    // m_log("Printing results");
+    // printf("[test %d] Error computation - end %d %d %d ", rank,flups_topo_get_nloc(topo,ax0) - is_last_topo[0], flups_topo_get_nloc(topo,ax1) - is_last_topo[1], flups_topo_get_nloc(topo,ax2) - is_last_topo[2] );
+    // std::string arg_name = (argc == 2) ? argv[1] : "default";
+    // print_res(field, topo, "data/sol_" + arg_name) ;
 
     // --------------------------------------------------------------------------
-    // Local error 
+    // Local error
     printf("[test - %d] my error is %e \n", rank, err);
 
     // --------------------------------------------------------------------------
-    // Global error 
+    // Global error
     double err_glob = 0.0;
     MPI_Allreduce(&err, &err_glob, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
     m_log("--------------- Global Error ---------------");
     m_log("The global error == %e", err_glob);
-    
+
     flups_profiler_disp(prof);
 
     //-------------------------------------------------------------------------
@@ -155,6 +153,7 @@ int main(int argc, char *argv[])  {
     flups_free(field);
     flups_topo_free(topo);
     flups_cleanup(mysolver);
+    flups_cleanup_fftw();
     flups_profiler_free(prof);
 
     for (int id = 0; id < 3; id++) {
@@ -166,30 +165,33 @@ int main(int argc, char *argv[])  {
     MPI_Finalize();
 }
 
-void print_res(double *A, const FLUPS_Topology* topo, std::string filename) {
+void print_res(double *A, const FLUPS_Topology *topo, std::string filename) {
     int rank;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
-    FILE *fptr; 
+    FILE       *fptr;
     std::string name = filename + "_" + std::to_string(rank) + ".txt";
-    fptr = fopen(name.c_str(), "w");
+    fptr             = fopen(name.c_str(), "w");
 
-    if(fptr == NULL){ printf("Error!");   exit(1); }
+    if (fptr == NULL) {
+        printf("Error!");
+        exit(1);
+    }
 
     const int ax0     = flups_topo_get_axis(topo);
     const int ax1     = (ax0 + 1) % 3;
     const int ax2     = (ax0 + 2) % 3;
-    int nmem[3] = {flups_topo_get_nmem(topo,0), flups_topo_get_nmem(topo,1), flups_topo_get_nmem(topo,2)};
-    
+    int       nmem[3] = {flups_topo_get_nmem(topo, 0), flups_topo_get_nmem(topo, 1), flups_topo_get_nmem(topo, 2)};
+
     // int gstart[3];
     // flups_topo_get_istartGlob(topo,gstart);
 
-    if (flups_topo_get_isComplex(topo)){
-        for (int i2 = 0; i2 < flups_topo_get_nloc(topo,ax2); i2++) {
-            for (int i1 = 0; i1 < flups_topo_get_nloc(topo,ax1); i1++) {
-                //local indexes start
-                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem,2);
-                for (int i0 = 0; i0 < flups_topo_get_nloc(topo,ax0); i0++) {
+    if (flups_topo_get_isComplex(topo)) {
+        for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2); i2++) {
+            for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1); i1++) {
+                // local indexes start
+                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem, 2);
+                for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0); i0++) {
                     // printf("(%d %d %d) %lg +i1 * %lg\n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id + i0 * 2], A[id + i0 * 2 + 1]);
                     fprintf(fptr, "\t (%e, i1 * %e)", A[id + i0 * 2], A[id + i0 * 2 + 1]);
                 }
@@ -197,13 +199,13 @@ void print_res(double *A, const FLUPS_Topology* topo, std::string filename) {
             }
             fprintf(fptr, "\n \n");
         }
-    
+
     } else {
-        for (int i2 = 0; i2 < flups_topo_get_nloc(topo,ax2); i2++) {
-            for (int i1 = 0; i1 < flups_topo_get_nloc(topo,ax1); i1++) {
-                //local indexes start
-                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem,1);
-                for (int i0 = 0; i0 < flups_topo_get_nloc(topo,ax0); i0++) {
+        for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2); i2++) {
+            for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1); i1++) {
+                // local indexes start
+                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem, 1);
+                for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0); i0++) {
                     // printf("(%d %d %d) %lg \n", i0 + gstart[ax0], i1 + gstart[ax1], i2 + gstart[ax2], A[id + i0]);
                     fprintf(fptr, "\t %e", A[id + i0]);
                 }
diff --git a/samples/solve_advanced_C/main_solve_advanced.c b/samples/solve_advanced_C/main_solve_advanced.c
index b2ef303..a05eb98 100644
--- a/samples/solve_advanced_C/main_solve_advanced.c
+++ b/samples/solve_advanced_C/main_solve_advanced.c
@@ -8,35 +8,35 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "mpi.h"
 #include "flups.h"
+#include "mpi.h"
 
-void print_res(double *A, const FLUPS_Topology* topo);
+void print_res(double *A, const FLUPS_Topology *topo);
 
 #define PRINT_RES
 
-int main(int argc, char *argv[]) {    
+int main(int argc, char *argv[]) {
     printf("Starting...\n");
     //-------------------------------------------------------------------------
     // Initialize MPI
     //-------------------------------------------------------------------------
-    int rank, comm_size;
+    int      rank, comm_size;
     MPI_Comm comm = MPI_COMM_WORLD;
 
     int provided;
     // set MPI_THREAD_FUNNELED or MPI_THREAD_SERIALIZED
     int requested = MPI_THREAD_FUNNELED;
     MPI_Init_thread(&argc, &argv, requested, &provided);
-    if(provided < requested){
+    if (provided < requested) {
         printf("Invalid number of procs\n");
-        MPI_Abort(comm,1);
+        MPI_Abort(comm, 1);
     }
-   
+
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &comm_size);
 
     //-------------------------------------------------------------------------
-    //Definition of the problem
+    // Definition of the problem
     //-------------------------------------------------------------------------
     const int     nglob[3] = {64, 64, 64};
     const int     nproc[3] = {1, 1, comm_size};
@@ -46,52 +46,51 @@ int main(int argc, char *argv[]) {
 
     const FLUPS_CenterType center_type[3] = {CELL_CENTER, CELL_CENTER, CELL_CENTER};
 
-    FLUPS_BoundaryType* mybc[3][2];
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
-            mybc[id][is] = (FLUPS_BoundaryType*) flups_malloc(sizeof(int)*1);
+    FLUPS_BoundaryType *mybc[3][2];
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
+            mybc[id][is]    = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * 1);
             mybc[id][is][0] = PER;
         }
     }
 
-    if(comm_size!=nproc[0]*nproc[1]*nproc[2]){
+    if (comm_size != nproc[0] * nproc[1] * nproc[2]) {
         printf("Invalid number of procs\n");
-        MPI_Abort(comm,1);
+        MPI_Abort(comm, 1);
     }
 
-
     //-------------------------------------------------------------------------
     /** - Initialize FLUPS */
     //-------------------------------------------------------------------------
 
     // create a real topology
-    FLUPS_Topology *topoIn      = flups_topo_new(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
+    FLUPS_Topology *topoIn = flups_topo_new(0, 1, nglob, nproc, false, NULL, FLUPS_ALIGNMENT, comm);
 
     // solver creation and init
     FLUPS_Solver *mysolver = flups_init(topoIn, mybc, h, L, NOD, center_type);
-    flups_set_greenType(mysolver,CHAT_2);
+    flups_set_greenType(mysolver, CHAT_2);
     flups_setup(mysolver, true);
-    double* solFLU = flups_get_innerBuffer(mysolver);
+    double *solFLU = flups_get_innerBuffer(mysolver);
 
     // recompute the communicator and the rank
     comm = flups_topo_get_comm(topoIn);
-    MPI_Comm_rank(comm,&rank);
+    MPI_Comm_rank(comm, &rank);
 
     // retrieveing internal info from the solver
     const FLUPS_Topology *topoSpec = flups_get_innerTopo_spectral(mysolver);
 
     int nmemIn[3];
     int istartIn[3], istartSpec[3];
-    int nmemIN[3],nmemSpec[3];
+    int nmemIN[3], nmemSpec[3];
 
     int memsizeIN = flups_topo_get_memsize(topoIn);
     for (int i = 0; i < 3; i++) {
         nmemIn[i]   = flups_topo_get_nmem(topoIn, i);
         nmemSpec[i] = flups_topo_get_nmem(topoSpec, i);
     }
-    flups_topo_get_istartGlob(topoIn,istartIn);
-    flups_topo_get_istartGlob(topoSpec,istartSpec);
-    
+    flups_topo_get_istartGlob(topoIn, istartIn);
+    flups_topo_get_istartGlob(topoSpec, istartSpec);
+
     // //-------------------------------------------------------------------------
     // /** - allocate rhs and solution */
     // //-------------------------------------------------------------------------
@@ -105,27 +104,27 @@ int main(int argc, char *argv[]) {
         fflush(stdout);
     }
 
-    double *rhsFLU   = (double *)flups_malloc(sizeof(double) * memsizeIN);
-    memset(rhsFLU, 0, sizeof(double ) * memsizeIN);
-    // memset(solFLU, 0, sizeof(double ) * FLUmemsizeOUT); 
-    
-    double f = 1; //frequency of the wave
-    const double c_2pi = 2.0*3.141592653589793;
+    double *rhsFLU = (double *)flups_malloc(sizeof(double) * memsizeIN);
+    memset(rhsFLU, 0, sizeof(double) * memsizeIN);
+    // memset(solFLU, 0, sizeof(double ) * FLUmemsizeOUT);
 
-    for (int i2 = 0; i2 < flups_topo_get_nloc(topoIn,2); i2++) {
-        for (int i1 = 0; i1 < flups_topo_get_nloc(topoIn,1); i1++) {
-            for (int i0 = 0; i0 < flups_topo_get_nloc(topoIn,0); i0++) {
-                double       x    = (istartIn[0] + i0 + 0.5) * h[0];
-                double       y    = (istartIn[1] + i1 + 0.5) * h[1];
-                double       z    = (istartIn[2] + i2 + 0.5) * h[2];
+    double       f     = 1;  // frequency of the wave
+    const double c_2pi = 2.0 * 3.141592653589793;
+
+    for (int i2 = 0; i2 < flups_topo_get_nloc(topoIn, 2); i2++) {
+        for (int i1 = 0; i1 < flups_topo_get_nloc(topoIn, 1); i1++) {
+            for (int i0 = 0; i0 < flups_topo_get_nloc(topoIn, 0); i0++) {
+                double x = (istartIn[0] + i0 + 0.5) * h[0];
+                double y = (istartIn[1] + i1 + 0.5) * h[1];
+                double z = (istartIn[2] + i2 + 0.5) * h[2];
 
                 size_t id;
-                id    = flups_locID(0, i0, i1, i2, 0, 0, nmemIn, 1);
+                id         = flups_locID(0, i0, i1, i2, 0, 0, nmemIn, 1);
                 rhsFLU[id] = sin((c_2pi / L[0] * f) * x) * sin((c_2pi / L[1] * f) * y) * sin((c_2pi / L[2] * f) * z);
             }
         }
     }
-    memcpy(solFLU, rhsFLU, sizeof(double ) * memsizeIN);
+    memcpy(solFLU, rhsFLU, sizeof(double) * memsizeIN);
 
     // //-------------------------------------------------------------------------
     // /** - Skip the copy of data
@@ -133,9 +132,9 @@ int main(int argc, char *argv[]) {
     // By using the advanced features of the API, we skip the copy which is done
     // in flups_solve: data are copied from the location the user allocated himself,
     // to an inner memory reserved by flups, and ensuring proper alignment.
-    // In this example, we have directly filled that "inner memory" (solFlu) with 
+    // In this example, we have directly filled that "inner memory" (solFlu) with
     // the initial condition
-    
+
     // flups_do_copy(mysolver,topoIn,solFLU,FLUPS_FORWARD);
 
     // //-------------------------------------------------------------------------
@@ -153,26 +152,26 @@ int main(int argc, char *argv[]) {
     // //-------------------------------------------------------------------------
 
     double kfact[3], koffset[3], symstart[3];
-    flups_get_spectralInfo(mysolver,kfact,koffset,symstart);
+    flups_get_spectralInfo(mysolver, kfact, koffset, symstart);
     if (rank == 0) {
-        printf("%lf %lf %lf  %lf %lf %lf  %lf %lf %lf  \n",kfact[0],kfact[1],kfact[2],koffset[0],koffset[1],koffset[2],symstart[0],symstart[1],symstart[2]);
+        printf("%lf %lf %lf  %lf %lf %lf  %lf %lf %lf  \n", kfact[0], kfact[1], kfact[2], koffset[0], koffset[1], koffset[2], symstart[0], symstart[1], symstart[2]);
     }
 
-    const int ax0     = flups_topo_get_axis(topoSpec);
-    const int ax1     = (ax0 + 1) % 3;
-    const int ax2     = (ax0 + 2) % 3;
-    const int nf      = 2; //topoSpec is full spectral, there are two doubles per element  
-    
+    const int ax0 = flups_topo_get_axis(topoSpec);
+    const int ax1 = (ax0 + 1) % 3;
+    const int ax2 = (ax0 + 2) % 3;
+    const int nf  = 2;  // topoSpec is full spectral, there are two doubles per element
+
     // int nmemSpec[3];
     // for(int i=0;i<3;i++){
     //     nmemSpec[i] = flups_topo_get_nmem(topoSpec,i);
     // }
-        
-    for (int i2 = 0; i2 < flups_topo_get_nloc(topoSpec,ax2); i2++) {
-        for (int i1 = 0; i1 < flups_topo_get_nloc(topoSpec,ax1); i1++) {
-            //local indexes start
-            const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmemSpec,nf);
-            for (int i0 = 0; i0 < flups_topo_get_nloc(topoSpec,ax0); i0++) {
+
+    for (int i2 = 0; i2 < flups_topo_get_nloc(topoSpec, ax2); i2++) {
+        for (int i1 = 0; i1 < flups_topo_get_nloc(topoSpec, ax1); i1++) {
+            // local indexes start
+            const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmemSpec, nf);
+            for (int i0 = 0; i0 < flups_topo_get_nloc(topoSpec, ax0); i0++) {
                 int is[3];
                 flups_symID(ax0, i0, i1, i2, istartSpec, symstart, 0, is);
 
@@ -181,10 +180,10 @@ int main(int argc, char *argv[]) {
                 const double k1 = (is[ax1] + koffset[ax1]) * kfact[ax1];
                 const double k2 = (is[ax2] + koffset[ax2]) * kfact[ax2];
 
-                const double ksqr   = k0 * k0 + k1 * k1 + k2 * k2;
+                const double ksqr = k0 * k0 + k1 * k1 + k2 * k2;
 
-                solFLU[id + i0 * nf] *= exp(-ksqr); //REAL part
-                solFLU[id + i0 * nf + 1] *= exp(-ksqr); //COMPLEX part
+                solFLU[id + i0 * nf] *= exp(-ksqr);      // REAL part
+                solFLU[id + i0 * nf + 1] *= exp(-ksqr);  // COMPLEX part
             }
         }
     }
@@ -209,7 +208,7 @@ int main(int argc, char *argv[]) {
 // #define PRINT_RES
 #ifdef PRINT_RES
     /* normalize*/
-    print_res(solFLU,topoIn);
+    print_res(solFLU, topoIn);
 #endif
 
     flups_hdf5_dump(topoIn, "result", solFLU);
@@ -217,58 +216,59 @@ int main(int argc, char *argv[]) {
     // //-------------------------------------------------------------------------
     // /** - CLEAN */
     // //-------------------------------------------------------------------------
-    
-    if(rank==0)
+
+    if (rank == 0)
         printf("Done ! Now let's clean...\n");
     fflush(stdout);
-    
+
     flups_free(rhsFLU);
 
     flups_topo_free(topoIn);
     flups_cleanup(mysolver);
+    flups_cleanup_fftw();
 
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
             flups_free(mybc[id][is]);
         }
     }
-    
+
     MPI_Finalize();
 }
 
-void print_res(double *A, const FLUPS_Topology* topo) {
-    const int ax0     = flups_topo_get_axis(topo);
-    const int ax1     = (ax0 + 1) % 3;
-    const int ax2     = (ax0 + 2) % 3;
-    int nmem[3];
-    
-    for(int i=0;i<3;i++){
-        nmem[i] = flups_topo_get_nmem(topo,i);
+void print_res(double *A, const FLUPS_Topology *topo) {
+    const int ax0 = flups_topo_get_axis(topo);
+    const int ax1 = (ax0 + 1) % 3;
+    const int ax2 = (ax0 + 2) % 3;
+    int       nmem[3];
+
+    for (int i = 0; i < 3; i++) {
+        nmem[i] = flups_topo_get_nmem(topo, i);
     }
 
     int gstart[3];
-    flups_topo_get_istartGlob(topo,gstart);
+    flups_topo_get_istartGlob(topo, gstart);
 
-    if (flups_topo_get_isComplex(topo)){
-        for (int i2 = 0; i2 < flups_topo_get_nloc(topo,ax2); i2++) {
+    if (flups_topo_get_isComplex(topo)) {
+        for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2); i2++) {
             printf("(z = %d) \n", i2 + gstart[ax2]);
-            for (int i1 = 0; i1 < flups_topo_get_nloc(topo,ax1); i1++) {
-                //local indexes start
-                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem,2);
-                for (int i0 = 0; i0 < flups_topo_get_nloc(topo,ax0); i0++) {
+            for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1); i1++) {
+                // local indexes start
+                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem, 2);
+                for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0); i0++) {
                     printf("(%1.3lg + i1* %1.3lg) \t", A[id + i0 * 2], A[id + i0 * 2 + 1]);
                 }
-                printf("\n");   
+                printf("\n");
             }
             printf("\n");
         }
     } else {
-        for (int i2 = 0; i2 < flups_topo_get_nloc(topo,ax2); i2++) {
+        for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2); i2++) {
             printf("(z = %d) \n", i2 + gstart[ax2]);
-            for (int i1 = 0; i1 < flups_topo_get_nloc(topo,ax1); i1++) {
-                //local indexes start
-                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem,1);
-                for (int i0 = 0; i0 < flups_topo_get_nloc(topo,ax0); i0++) {
+            for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1); i1++) {
+                // local indexes start
+                const size_t id = flups_locID(ax0, 0, i1, i2, 0, ax0, nmem, 1);
+                for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0); i0++) {
                     printf(" %lg \t", A[id + i0]);
                 }
                 printf("\n");
diff --git a/samples/solve_vtube/src/vtube.cpp b/samples/solve_vtube/src/vtube.cpp
index 5747707..4dd5fd6 100644
--- a/samples/solve_vtube/src/vtube.cpp
+++ b/samples/solve_vtube/src/vtube.cpp
@@ -1,14 +1,14 @@
 /**
- * @file vtube.cpp
  * @copyright Copyright (c) Université catholique de Louvain (UCLouvain), Belgique 
  *      See LICENSE file in top-level directory
  */
 
 #include "vtube.hpp"
+
 #include "omp.h"
 
-template <> 
-double gexpint<1>(const double z){
+template <>
+double gexpint<1>(const double z) {
     // for real values E1(x) = -Ei(-x):
     // according to
     // https://en.cppreference.com/w/cpp/numeric/special_functions/expint and
@@ -16,9 +16,9 @@ double gexpint<1>(const double z){
     return (-std::expint(-z));
 }
 
-double vort_inf(const double r, const double sigma){
+double vort_inf(const double r, const double sigma) {
     // Gregoire winckelmans 2004 encyclopedia
-    const double rho1  = r / sigma;
+    const double rho1 = r / sigma;
     return 1.0 / (c_2pi * sigma * sigma) * exp(-rho1 * rho1 * 0.5);
 }
 double vel_inf(const double r, const double sigma) {
@@ -28,10 +28,10 @@ double vel_inf(const double r, const double sigma) {
     return (r < eps) ? 0 : 1.0 / (c_2pi * r) * (1.0 - exp(-rho1 * rho1 * 0.5));
 }
 
-double vort_compact(const double r, const double R){
+double vort_compact(const double r, const double R) {
     // Wincklemans 2015, hand-written notes (cfr vortexbib)
-    const double rho2  = r / (R);
-    const double oo_rad2 = 1.0/(R*R);
+    const double rho2    = r / (R);
+    const double oo_rad2 = 1.0 / (R * R);
     return (rho2 >= 1.0) ? (0.0) : (1.0 / c_2pi * (2.0 * oo_rad2) * (1.0 / gexpint<2>(1.0)) * exp(-1.0 / (1.0 - rho2 * rho2)));
 }
 double vel_compact(const double r, const double R) {
@@ -67,9 +67,9 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
     const double fact    = (double)(!is_cell);
 
     // Ensure that 64 points in cell-centred mode is equivalent 64 points in node-centred mode
-    const int    nglob[3] = {myCase.nglob[0] + (int)fact,
-                             myCase.nglob[1] + (int)fact,
-                             myCase.nglob[2] + (int)fact};
+    const int nglob[3] = {myCase.nglob[0] + (int)fact,
+                          myCase.nglob[1] + (int)fact,
+                          myCase.nglob[2] + (int)fact};
 
     // h definition depends on if we are cell- of node-centred
     const double h[3] = {L[0] / (nglob[0] - fact),
@@ -78,13 +78,13 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
 
     FLUPS_CenterType center_type[3];
     for (int i = 0; i < 3; i++) {
-            center_type[i] = myCase.center[i];
+        center_type[i] = myCase.center[i];
     }
 
-    FLUPS_BoundaryType* mybc[3][2];
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
-            mybc[id][is] = (FLUPS_BoundaryType*) flups_malloc(sizeof(int)*lda);
+    FLUPS_BoundaryType *mybc[3][2];
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
+            mybc[id][is] = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * lda);
         }
     }
 
@@ -104,7 +104,7 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
     //-------------------------------------------------------------------------
     std::string     name = "tube_" + std::to_string((int)(nglob[0] / L[0])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
     FLUPS_Profiler *prof = flups_profiler_new_n(name.c_str());
-    FLUPS_Solver *  mysolver;
+    FLUPS_Solver   *mysolver;
     mysolver = flups_init_timed(topo, mybc, h, L, order, center_type, prof);
 
     flups_set_greenType(mysolver, typeGreen);
@@ -148,8 +148,8 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
         double *rhs1 = rhs + flups_locID(ax0, 0, 0, 0, 1, ax0, nmem, 1);
         double *rhs2 = rhs + flups_locID(ax0, 0, 0, 0, 2, ax0, nmem, 1);
         double *sol0 = sol + flups_locID(ax0, 0, 0, 0, 0, ax0, nmem, 1);
-        double* sol1 = sol + flups_locID(ax0, 0, 0, 0, 1, ax0, nmem, 1);
-        double* sol2 = sol + flups_locID(ax0, 0, 0, 0, 2, ax0, nmem, 1);
+        double *sol1 = sol + flups_locID(ax0, 0, 0, 0, 1, ax0, nmem, 1);
+        double *sol2 = sol + flups_locID(ax0, 0, 0, 0, 2, ax0, nmem, 1);
 
         int dir0 = (vdir + 1) % 3;
         int dir1 = (vdir + 2) % 3;
@@ -298,53 +298,53 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
                                 sol2[id] += 0.0 * myCase.ysign * myCase.xsign;
                             }
                         }
-                    // } else if (cross) {
-                    //     const double lpos[3] = {pos[0] - (myCase.xcntr * L[0]),
-                    //                             pos[1] - (myCase.ycntr * L[1]),
-                    //                             pos[2] - (myCase.zcntr * L[2])};
-                    //     double      *lrhs[3] = {rhs0, rhs1, rhs2};
-                    //     double      *lsol[3] = {sol0, sol1, sol2};
-
-                    //     for (int i = 0; i < 3; ++i) {
-                    //         lrhs[(i + 0) % 3][id] = 0.0;
-                    //         lrhs[(i + 1) % 3][id] = 0.0;
-                    //         lrhs[(i + 2) % 3][id] = 0.0;
-                    //         lsol[(i + 0) % 3][id] = 0.0;
-                    //         lsol[(i + 1) % 3][id] = 0.0;
-                    //         lsol[(i + 2) % 3][id] = 0.0;
-                    //     }
-
-                    //     for (int i = 0; i < 3; ++i) {
-                    //         const double x = lpos[(i + 1) % 3];
-                    //         const double y = lpos[(i + 2) % 3];
-                    //         // tube in z
-                    //         const double theta = std::atan2(y, x);  // get the angle in the x-y plane
-                    //         const double r     = sqrt(x * x + y * y);
-                    //         const double vort  = (compact) ? vort_compact(r, R) : vort_inf(r, sigma);
-                    //         const double vel   = (compact) ? vel_compact(r, R) : vel_inf(r, sigma);
-                    //         lrhs[(i + 0) % 3][id] += -vort;
-                    //         lrhs[(i + 1) % 3][id] += 0.0;
-                    //         lrhs[(i + 2) % 3][id] += 0.0;
-                    //         lsol[(i + 0) % 3][id] += 0.0;
-                    //         lsol[(i + 1) % 3][id] += -sin(theta) * vel;
-                    //         lsol[(i + 2) % 3][id] += +cos(theta) * vel;
-                    //     }
-                    // } else if (gauss) {
-                    //     const double x = pos[0] - (myCase.xcntr * L[0]);
-                    //     const double y = pos[1] - (myCase.ycntr * L[1]);
-                    //     const double z = pos[2] - (myCase.zcntr * L[2]);
-                    //     // tube in z
-                    //     const double r     = sqrt(x * x + y * y + z*z);
-                    //     const double rho = r/sigma;
-                    //     const double vort  = 1.0/(4.0*M_PI*pow(sigma,3)) * sqrt(2.0/M_PI) * exp(-0.5 * rho * rho) ;
-                    //     const double vel   = 1.0;
-                    //     lrhs[(i + 0) % 3][id] += -vort;
-                    //     lrhs[(i + 1) % 3][id] += 0.0;
-                    //     lrhs[(i + 2) % 3][id] += 0.0;
-                    //     lsol[(i + 0) % 3][id] += 0.0;
-                    //     lsol[(i + 1) % 3][id] += -sin(theta) * vel;
-                    //     lsol[(i + 2) % 3][id] += +cos(theta) * vel;
-                    // }
+                        // } else if (cross) {
+                        //     const double lpos[3] = {pos[0] - (myCase.xcntr * L[0]),
+                        //                             pos[1] - (myCase.ycntr * L[1]),
+                        //                             pos[2] - (myCase.zcntr * L[2])};
+                        //     double      *lrhs[3] = {rhs0, rhs1, rhs2};
+                        //     double      *lsol[3] = {sol0, sol1, sol2};
+
+                        //     for (int i = 0; i < 3; ++i) {
+                        //         lrhs[(i + 0) % 3][id] = 0.0;
+                        //         lrhs[(i + 1) % 3][id] = 0.0;
+                        //         lrhs[(i + 2) % 3][id] = 0.0;
+                        //         lsol[(i + 0) % 3][id] = 0.0;
+                        //         lsol[(i + 1) % 3][id] = 0.0;
+                        //         lsol[(i + 2) % 3][id] = 0.0;
+                        //     }
+
+                        //     for (int i = 0; i < 3; ++i) {
+                        //         const double x = lpos[(i + 1) % 3];
+                        //         const double y = lpos[(i + 2) % 3];
+                        //         // tube in z
+                        //         const double theta = std::atan2(y, x);  // get the angle in the x-y plane
+                        //         const double r     = sqrt(x * x + y * y);
+                        //         const double vort  = (compact) ? vort_compact(r, R) : vort_inf(r, sigma);
+                        //         const double vel   = (compact) ? vel_compact(r, R) : vel_inf(r, sigma);
+                        //         lrhs[(i + 0) % 3][id] += -vort;
+                        //         lrhs[(i + 1) % 3][id] += 0.0;
+                        //         lrhs[(i + 2) % 3][id] += 0.0;
+                        //         lsol[(i + 0) % 3][id] += 0.0;
+                        //         lsol[(i + 1) % 3][id] += -sin(theta) * vel;
+                        //         lsol[(i + 2) % 3][id] += +cos(theta) * vel;
+                        //     }
+                        // } else if (gauss) {
+                        //     const double x = pos[0] - (myCase.xcntr * L[0]);
+                        //     const double y = pos[1] - (myCase.ycntr * L[1]);
+                        //     const double z = pos[2] - (myCase.zcntr * L[2]);
+                        //     // tube in z
+                        //     const double r     = sqrt(x * x + y * y + z*z);
+                        //     const double rho = r/sigma;
+                        //     const double vort  = 1.0/(4.0*M_PI*pow(sigma,3)) * sqrt(2.0/M_PI) * exp(-0.5 * rho * rho) ;
+                        //     const double vel   = 1.0;
+                        //     lrhs[(i + 0) % 3][id] += -vort;
+                        //     lrhs[(i + 1) % 3][id] += 0.0;
+                        //     lrhs[(i + 2) % 3][id] += 0.0;
+                        //     lsol[(i + 0) % 3][id] += 0.0;
+                        //     lsol[(i + 1) % 3][id] += -sin(theta) * vel;
+                        //     lsol[(i + 2) % 3][id] += +cos(theta) * vel;
+                        // }
                     } else if (ring) {
                         //---------------------------------------------------------------
                         //                      WARNING
@@ -409,13 +409,13 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
                         }
                     } else if (periodic) {
                         {
-                            const double x = pos[0];
-                            const double y = pos[1];
-                            const double z = pos[2];
+                            const double x    = pos[0];
+                            const double y    = pos[1];
+                            const double z    = pos[2];
                             const double k[3] = {2.0 * M_PI, 2.0 * M_PI, 2.0 * M_PI};
-                            rhs0[id] = cos(k[1] * y) / k[1];
-                            rhs1[id] = cos(k[2] * z) / k[2];
-                            rhs2[id] = cos(k[0] * x) / k[0];
+                            rhs0[id]          = cos(k[1] * y) / k[1];
+                            rhs1[id]          = cos(k[2] * z) / k[2];
+                            rhs2[id]          = cos(k[0] * x) / k[0];
                             // rot_x = sin(k[2] * z) -> sol = -1/k[2] * sin(k[2] * z)
                             // rot_y = sin(k[0] * x) -> sol = -1/k[0] * sin(k[0] * x)
                             // rot_z = sin(k[1] * y) -> sol = -1/k[1] * sin(k[1] * y)
@@ -447,15 +447,15 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
                             const double z    = pos[2];
                             const double k[3] = {1.5 * M_PI, 1.5 * M_PI, 1.5 * M_PI};
 
-                            rhs0[id] = 0.0; //sin(k[1] * y) / k[1];
+                            rhs0[id] = 0.0;  // sin(k[1] * y) / k[1];
                             rhs1[id] = sin(k[2] * z) / k[2];
-                            rhs2[id] = 0.0; //sin(k[0] * x) / k[0];
+                            rhs2[id] = 0.0;  // sin(k[0] * x) / k[0];
                             // rot_x = cos(k[2] * z) -> sol = -1/k[2]^2 * cos(k[2] * z)
                             // rot_y = cos(k[0] * x) -> sol = -1/k[0]^2 * cos(k[0] * x)
                             // rot_z = cos(k[1] * y) -> sol = -1/k[1]^2 * cos(k[1] * y)
                             sol0[id] = -1.0 / (k[2] * k[2]) * cos(k[2] * z);
-                            sol1[id] = 0.0; //-1.0 / (k[0] * k[0]) * cos(k[0] * x);
-                            sol2[id] = 0.0; //-1.0 / (k[1] * k[1]) * cos(k[1] * y);
+                            sol1[id] = 0.0;  //-1.0 / (k[0] * k[0]) * cos(k[0] * x);
+                            sol2[id] = 0.0;  //-1.0 / (k[1] * k[1]) * cos(k[1] * y);
                         }
                     }
                 }
@@ -478,11 +478,9 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
         flups_solve(mysolver, field, rhs, ROT);
     }
 
-
     flups_profiler_disp(prof);
     flups_profiler_free(prof);
 
-
 #ifdef DUMP_DBG
     // write the source term and the solution
     sprintf(msg, "sol_%d%d%d%d%d%d_%dx%dx%d", mybc[0][0][0], mybc[0][1][0], mybc[1][0][0], mybc[1][1][0], mybc[2][0][0], mybc[2][1][0], nglob[0], nglob[1], nglob[2]);
@@ -502,7 +500,7 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
     std::memset(err2, 0, sizeof(double) * lda);
     std::memset(erri, 0, sizeof(double) * lda);
 
-    //determine the volume associated to a mesh
+    // determine the volume associated to a mesh
     double vol = 1.0;
     for (int id = 0; id < 3; id++) {
         if (mybc[id][0][0] != NONE && mybc[id][1][0] != NONE) {
@@ -516,23 +514,23 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
         const int ax2     = (ax0 + 2) % 3;
         const int nmem[3] = {flups_topo_get_nmem(topo, 0), flups_topo_get_nmem(topo, 1), flups_topo_get_nmem(topo, 2)};
         for (int lia = 0; lia < lda; lia++) {
-            // Check if flups is able to compute the last point of the domain. If at least one of the direction is periodic, 
+            // Check if flups is able to compute the last point of the domain. If at least one of the direction is periodic,
             // and we are in an node centred framework, then the last point in each direction may be influenced
-            const bool last_point_valid = !(!is_cell && ( mybc[lia][0][0] == PER || mybc[lia][0][1] == PER || mybc[lia][0][2] == PER ));
-            
+            const bool last_point_valid = !(!is_cell && (mybc[lia][0][0] == PER || mybc[lia][0][1] == PER || mybc[lia][0][2] == PER));
+
             // If the last point is not valid, remove it from the error computation
             const int iend[3] = {flups_topo_get_nloc(topo, ax0),
                                  flups_topo_get_nloc(topo, ax1),
                                  flups_topo_get_nloc(topo, ax2)};
-            
+
             for (int i2 = 0; i2 < iend[2]; i2++) {
                 for (int i1 = 0; i1 < iend[1]; i1++) {
                     for (int i0 = 0; i0 < iend[0]; i0++) {
                         const size_t id  = flups_locID(ax0, i0, i1, i2, lia, ax0, nmem, 1);
                         const double err = sol[id] - field[id];
-                        lerri[lia] = max(lerri[lia], fabs(err));
+                        lerri[lia]       = max(lerri[lia], fabs(err));
                         lerr2[lia] += (err * err) * vol;
-                        error[id]  = err; 
+                        error[id] = err;
                     }
                 }
             }
@@ -552,9 +550,9 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
 #endif
 
     char   filename[512];
-    string folder = "./data";
-    string ct_name = is_cell ? "CellCenter" : "NodeCenter"; 
-    sprintf(filename, "%s/%s_%s_%d_%d%d%d_typeGreen=%d.txt", folder.c_str(), __func__, ct_name.c_str(), type, vdir, (int) myCase.xsign, (int)myCase.ysign, typeGreen);
+    string folder  = "./data";
+    string ct_name = is_cell ? "CellCenter" : "NodeCenter";
+    sprintf(filename, "%s/%s_%s_%d_%d%d%d_typeGreen=%d.txt", folder.c_str(), __func__, ct_name.c_str(), type, vdir, (int)myCase.xsign, (int)myCase.ysign, typeGreen);
 
     if (rank == 0) {
         struct stat st = {0};
@@ -585,12 +583,13 @@ void vtube(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int
     free(lerri);
     free(err2);
     free(erri);
-    
+
     flups_free(sol);
     flups_free(rhs);
     flups_free(field);
     flups_free(error);
     flups_cleanup(mysolver);
+    flups_cleanup_fftw();
     flups_topo_free(topo);
 
     for (int id = 0; id < 3; id++) {
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=11.txt b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=11.txt
new file mode 100644
index 0000000..e856869
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=11.txt
@@ -0,0 +1 @@
+16 3.311760673069e-02 6.001754483240e-02 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=12.txt b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=12.txt
new file mode 100644
index 0000000..e856869
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=12.txt
@@ -0,0 +1 @@
+16 3.311760673069e-02 6.001754483240e-02 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=13.txt b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=13.txt
new file mode 100644
index 0000000..cb19938
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=13.txt
@@ -0,0 +1 @@
+16 3.296578486463e-04 5.974240491287e-04 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=14.txt b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=14.txt
new file mode 100644
index 0000000..492d43f
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_333399_typeGreen=14.txt
@@ -0,0 +1 @@
+16 4.420020480575e-05 8.010203741826e-05 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=11.txt b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=11.txt
new file mode 100644
index 0000000..2decaf8
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=11.txt
@@ -0,0 +1 @@
+16 8.279381136966e-03 5.177451801366e-02 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=12.txt b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=12.txt
new file mode 100644
index 0000000..2decaf8
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=12.txt
@@ -0,0 +1 @@
+16 8.279381136966e-03 5.177451801366e-02 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=13.txt b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=13.txt
new file mode 100644
index 0000000..9ac0bea
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=13.txt
@@ -0,0 +1 @@
+16 2.601533757485e-04 8.942064632086e-04 
diff --git a/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=14.txt b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=14.txt
new file mode 100644
index 0000000..ced3cde
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_CellCenter_401499_typeGreen=14.txt
@@ -0,0 +1 @@
+16 6.344010847012e-05 2.062116091302e-04 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=11.txt b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=11.txt
new file mode 100644
index 0000000..6f600b9
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=11.txt
@@ -0,0 +1 @@
+17 3.311760673069e-02 6.623521346139e-02 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=12.txt b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=12.txt
new file mode 100644
index 0000000..6f600b9
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=12.txt
@@ -0,0 +1 @@
+17 3.311760673069e-02 6.623521346139e-02 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=13.txt b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=13.txt
new file mode 100644
index 0000000..b05f12e
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=13.txt
@@ -0,0 +1 @@
+17 3.296578486463e-04 6.593156972932e-04 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=14.txt b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=14.txt
new file mode 100644
index 0000000..21100ec
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_333399_typeGreen=14.txt
@@ -0,0 +1 @@
+17 4.420020480569e-05 8.840040961200e-05 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=11.txt b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=11.txt
new file mode 100644
index 0000000..f50098b
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=11.txt
@@ -0,0 +1 @@
+17 8.290370265556e-03 5.322438967995e-02 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=12.txt b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=12.txt
new file mode 100644
index 0000000..f50098b
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=12.txt
@@ -0,0 +1 @@
+17 8.290370265556e-03 5.322438967995e-02 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=13.txt b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=13.txt
new file mode 100644
index 0000000..b2d4500
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=13.txt
@@ -0,0 +1 @@
+17 2.785479427274e-04 1.315704724925e-03 
diff --git a/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=14.txt b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=14.txt
new file mode 100644
index 0000000..cbce81c
--- /dev/null
+++ b/samples/validation/data_ref/validation_3d_NodeCenter_401499_typeGreen=14.txt
@@ -0,0 +1 @@
+17 7.436135343861e-05 4.195883266850e-04 
diff --git a/samples/validation/scripts/test_3D_kerns.py b/samples/validation/scripts/test_3D_kerns.py
index 5c90e36..66321ef 100644
--- a/samples/validation/scripts/test_3D_kerns.py
+++ b/samples/validation/scripts/test_3D_kerns.py
@@ -59,10 +59,6 @@
             print("skip kernel 7 and 2dirunbounded... unsupported error due to inherent approximation.")
             continue
 
-        if (kern in ['11','12','13','14'] and bcs[4:6] == ["9","9"]) :
-            print("Mehrstellen stencils not implemented for 2D problems")
-            continue
-
         if(bcs[4:6] == ["9","9"]):
             str_bcs = str(bcs[0]) + "," + str(bcs[1])+ "," + str(bcs[2])+ "," + str(bcs[3])+ "," + str(bcs[4])+ "," + str(bcs[5]) 
             r = subprocess.run(["mpirun"] + ["-np"] + ["2"] + ["./flups_validation_nb"] + ["--np=1,2,1"] + ["--kernel="+str(kern)] + ["--center=" + str(centerType)] + ["--res=16,16,1"] + ["--nres=1"] + ["--bc="+str_bcs], capture_output=True)
diff --git a/samples/validation/src/validation_3d.cpp b/samples/validation/src/validation_3d.cpp
index 03ed591..a37a081 100644
--- a/samples/validation/src/validation_3d.cpp
+++ b/samples/validation/src/validation_3d.cpp
@@ -4,11 +4,11 @@
  *      See LICENSE file in top-level directory
  */
 
-#include <unistd.h>
-#include <limits.h>
-
 #include "validation_3d.hpp"
 
+#include <limits.h>
+#include <unistd.h>
+
 #include "omp.h"
 
 using namespace std;
@@ -26,31 +26,31 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
  * @param nSolve number of times we call the same solver (for timing)
  */
 void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, const int lda, const int nSolve, const std::string output_dir) {
-    int rank, comm_size;
+    int      rank, comm_size;
     MPI_Comm comm = MPI_COMM_WORLD;
     MPI_Comm_rank(comm, &rank);
     MPI_Comm_size(comm, &comm_size);
 
-    const int *   nproc  = myCase.nproc;
-    const double *L      = myCase.L;
+    const int    *nproc = myCase.nproc;
+    const double *L     = myCase.L;
 
-    const bool is_cell = myCase.center_type[0] == CELL_CENTER; 
-    const double fact = (double) (!is_cell);
+    const bool   is_cell = myCase.center_type[0] == CELL_CENTER;
+    const double fact    = (double)(!is_cell);
 
-    const int nglob[3] = {myCase.nglob[0] + (int)fact, myCase.nglob[1] + (int)fact,
+    const int    nglob[3] = {myCase.nglob[0] + (int)fact, myCase.nglob[1] + (int)fact,
                           (myCase.nglob[2] == 1) ? 1 : myCase.nglob[2] + (int)fact};
-    const double h[3] = {L[0] / (nglob[0] - fact), L[1] / (nglob[1] - fact),
+    const double h[3]     = {L[0] / (nglob[0] - fact), L[1] / (nglob[1] - fact),
                          nglob[2] == 1 ? 1 : L[2] / (nglob[2] - fact)};
 
     FLUPS_CenterType center_type[3];
     for (int i = 0; i < 3; i++) {
-            center_type[i] = myCase.center_type[i];
+        center_type[i] = myCase.center_type[i];
     }
 
-    FLUPS_BoundaryType* mybc[3][2];
-    for(int id=0; id<3; id++){
-        for(int is=0; is<2; is++){
-            mybc[id][is] = (FLUPS_BoundaryType*) flups_malloc(sizeof(int)*lda);
+    FLUPS_BoundaryType *mybc[3][2];
+    for (int id = 0; id < 3; id++) {
+        for (int is = 0; is < 2; is++) {
+            mybc[id][is] = (FLUPS_BoundaryType *)flups_malloc(sizeof(int) * lda);
         }
     }
 
@@ -81,9 +81,9 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
     //-------------------------------------------------------------------------
     std::string     name = "validation_nx" + std::to_string((int)(nglob[0])) + "_ny" + std::to_string((int)(nglob[1])) + "_nz" + std::to_string((int)(nglob[2])) + "_nrank" + std::to_string(comm_size) + "_nthread" + std::to_string(omp_get_max_threads());
     FLUPS_Profiler *prof = flups_profiler_new_n(name.c_str());
-    
+
     m_profStart(prof, "Validation--Init-Flups");
-    FLUPS_Solver *  mysolver;
+    FLUPS_Solver *mysolver;
     mysolver = flups_init_timed(topo, mybc, h, L, NOD, center_type, prof);
 
     flups_set_greenType(mysolver, typeGreen);
@@ -94,7 +94,7 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
     MPI_Comm_rank(comm, &rank);
     m_profStop(prof, "Validation--Init-Flups");
 
-    //flups_switchtopo_info(mysolver);
+    // flups_switchtopo_info(mysolver);
 
     //-------------------------------------------------------------------------
     /** - allocate rhs and solution */
@@ -126,19 +126,19 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
      */
     for (int lia = 0; lia < lda; lia++) {
         for (int j2 = -1; j2 < 2; j2++) {
-            if (j2 != 0 && mybc[2][(j2 + 1) / 2] == UNB) continue;  //skip unbounded dirs
+            if (j2 != 0 && mybc[2][(j2 + 1) / 2] == UNB) continue;  // skip unbounded dirs
             for (int j1 = -1; j1 < 2; j1++) {
-                if (j1 != 0 && mybc[1][(j1 + 1) / 2] == UNB) continue;  //skip unbounded dirs
+                if (j1 != 0 && mybc[1][(j1 + 1) / 2] == UNB) continue;  // skip unbounded dirs
                 for (int j0 = -1; j0 < 2; j0++) {
-                    if (j0 != 0 && mybc[0][(j0 + 1) / 2] == UNB) continue;  //skip unbounded dirs
+                    if (j0 != 0 && mybc[0][(j0 + 1) / 2] == UNB) continue;  // skip unbounded dirs
 
                     double sign = 1.0;
                     double centerPos[3];
-                    double orig[3] = {j0 * L[0], j1 * L[1], j2 * L[2]};  //inner left corner of the current block i'm looking at
+                    double orig[3] = {j0 * L[0], j1 * L[1], j2 * L[2]};  // inner left corner of the current block i'm looking at
 
-                    sign *= j0 == 0 ? 1.0 : 1 - 2 * (mybc[0][(j0 + 1) / 2] == ODD);  //multiply by -1 if the symm is odd
-                    sign *= j1 == 0 ? 1.0 : 1 - 2 * (mybc[1][(j1 + 1) / 2] == ODD);  //multiply by -1 if the symm is odd
-                    sign *= j2 == 0 ? 1.0 : 1 - 2 * (mybc[2][(j2 + 1) / 2] == ODD);  //multiply by -1 if the symm is odd
+                    sign *= j0 == 0 ? 1.0 : 1 - 2 * (mybc[0][(j0 + 1) / 2] == ODD);  // multiply by -1 if the symm is odd
+                    sign *= j1 == 0 ? 1.0 : 1 - 2 * (mybc[1][(j1 + 1) / 2] == ODD);  // multiply by -1 if the symm is odd
+                    sign *= j2 == 0 ? 1.0 : 1 - 2 * (mybc[2][(j2 + 1) / 2] == ODD);  // multiply by -1 if the symm is odd
 
                     centerPos[0] = orig[0] + ((j0 != 0) && (mybc[0][(j0 + 1) / 2] != PER) ? (1.0 - center[0]) * L[0] : (center[0] * L[0]));
                     centerPos[1] = orig[1] + ((j1 != 0) && (mybc[1][(j1 + 1) / 2] != PER) ? (1.0 - center[1]) * L[1] : (center[1] * L[1]));
@@ -278,13 +278,13 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
             }
         }
 
-        //USE THE FOLLOWING TO TEST THE K=0 PART OF THE 1DIRUNBOUNDED KERNEL
-        // manuRHS[0] = &fZero;
-        // manuSol[0] = &fCst;
-        // manuRHS[1] = &fZero;
-        // manuSol[1] = &fCst;
-        // manuRHS[2] = &d2dx2_fUnbSpietz;
-        // manuSol[2] = &fUnbSpietz;
+        // USE THE FOLLOWING TO TEST THE K=0 PART OF THE 1DIRUNBOUNDED KERNEL
+        //  manuRHS[0] = &fZero;
+        //  manuSol[0] = &fCst;
+        //  manuRHS[1] = &fZero;
+        //  manuSol[1] = &fCst;
+        //  manuRHS[2] = &d2dx2_fUnbSpietz;
+        //  manuSol[2] = &fUnbSpietz;
 
         {
             // Obtaining the reference sol and rhs
@@ -298,32 +298,32 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
             for (int i2 = 0; i2 < flups_topo_get_nloc(topo, ax2); i2++) {
                 for (int i1 = 0; i1 < flups_topo_get_nloc(topo, ax1); i1++) {
                     for (int i0 = 0; i0 < flups_topo_get_nloc(topo, ax0); i0++) {
-                        const size_t id   = flups_locID(ax0, i0, i1, i2, lia, ax0, nmem, 1);
-                        const double shift = is_cell ? 0.5: 0.0;
-                        const double x[3] = {(istart[ax0] + i0 + shift) * h[ax0],
-                                             (istart[ax1] + i1 + shift) * h[ax1],
-                                             (istart[ax2] + i2 + shift) * h[ax2]};
+                        const size_t id    = flups_locID(ax0, i0, i1, i2, lia, ax0, nmem, 1);
+                        const double shift = is_cell ? 0.5 : 0.0;
+                        const double x[3]  = {(istart[ax0] + i0 + shift) * h[ax0],
+                                              (istart[ax1] + i1 + shift) * h[ax1],
+                                              (istart[ax2] + i2 + shift) * h[ax2]};
 
                         // const double y[3] = {0.,(x[1]-.5)/params.sigma[1],(x[2]-.5)/params.sigma[2] };
                         // const double rsq = y[1]*y[1] + y[2]*y[2] ; //((x[1]-.5)*(x[1]-.5)+(x[2]-.5)*(x[2]-.5)) / .25;
                         // const double r = sqrt(rsq);
 
-                        //CST(z):
-                        // sol[id] = fabs(rsq)>=1. ?  0.0 : exp(c_C * (1. - 1. / (1. - rsq))) ;
-                        // rhs[id] = fabs(rsq) >= 1. ? 0.0 : 4.*c_C* exp(c_C * (1. - 1. / (1. - rsq))) / (pow(rsq - 1., 4) * params.sigma[id] * params.sigma[id]) * \
+                        // CST(z):
+                        //  sol[id] = fabs(rsq)>=1. ?  0.0 : exp(c_C * (1. - 1. / (1. - rsq))) ;
+                        //  rhs[id] = fab s(rsq) >= 1. ? 0.0 : 4.*c_C* exp(c_C * (1. - 1. / (1. - rsq))) / (pow(rsq - 1., 4) * params.sigma[id] * params.sigma[id]) * \
                         //              (c_C * rsq - 1. + pow(y[1],4) + pow(y[2],4) + 2.* y[1]*y[1]*y[2]*y[2]) ;
 
-                        //SIN(z):
-                        // sol[id] = fabs(rsq) >= 1. ? 0.0 : exp(c_C * (1. - 1. / (1. - rsq))) * (sin(2. * M_PI * x[0] / L[0]));
-                        // rhs[id] = fabs(rsq) >= 1. ? 0.0 : 4.*c_C* exp(c_C * (1. - 1. / (1. - rsq))) / (pow(rsq - 1., 4) * params.sigma[id] * params.sigma[id]) * \
+                        // SIN(z):
+                        //  sol[id] = fabs(rsq) >= 1. ? 0.0 : exp(c_C * (1. - 1. / (1. - rsq))) * (sin(2. * M_PI * x[0] / L[0]));
+                        //  rhs[id] = fab s(rsq) >= 1. ? 0.0 : 4.*c_C* exp(c_C * (1. - 1. / (1. - rsq))) / (pow(rsq - 1., 4) * params.sigma[id] * params.sigma[id]) * \
                         //              (c_C * rsq - 1. + pow(y[1],4) + pow(y[2],4) + 2.* y[1]*y[1]*y[2]*y[2]) * (sin(2.*M_PI*x[0] /L[0])) ;
-                        // rhs[id] += fabs(rsq) >= 1. ? 0.0 : -sin(2*M_PI*x[0] /L[0]) * (2. * M_PI / L[0])* (2. * M_PI / L[0])  * exp(c_C * (1. - 1. / (1. - rsq))) ;
+                        //  rhs[id] += fabs(rsq) >= 1. ? 0.0 : -sin(2*M_PI*x[0] /L[0]) * (2. * M_PI / L[0])* (2. * M_PI / L[0])  * exp(c_C * (1. - 1. / (1. - rsq))) ;
 
-                        //CST(z)+sin(z):
-                        // sol[id] = fabs(rsq) >= 1. ? 0.0 : exp(c_C * (1. - 1. / (1. - rsq))) * (1. + sin(2. * M_PI * x[0] / L[0]));
-                        // rhs[id] = fabs(rsq) >= 1. ? 0.0 : 4.*c_C* exp(c_C * (1. - 1. / (1. - rsq))) / (pow(rsq - 1., 4) * params.sigma[id] * params.sigma[id]) * \
+                        // CST(z)+sin(z):
+                        //  sol[id] = fabs(rsq) >= 1. ? 0.0 : exp(c_C * (1. - 1. / (1. - rsq))) * (1. + sin(2. * M_PI * x[0] / L[0]));
+                        //  rhs[id] = fab s(rsq) >= 1. ? 0.0 : 4.*c_C* exp(c_C * (1. - 1. / (1. - rsq))) / (pow(rsq - 1., 4) * params.sigma[id] * params.sigma[id]) * \
                         //              (c_C * rsq - 1. + pow(y[1],4) + pow(y[2],4) + 2.* y[1]*y[1]*y[2]*y[2]) * (1.+sin(2.*M_PI*x[0] /L[0])) ;
-                        // rhs[id] += fabs(rsq) >= 1. ? 0.0 : -sin(2*M_PI*x[0] /L[0]) * (2. * M_PI / L[0])* (2. * M_PI / L[0])  * exp(c_C * (1. - 1. / (1. - rsq))) ;
+                        //  rhs[id] += fabs(rsq) >= 1. ? 0.0 : -sin(2*M_PI*x[0] /L[0]) * (2. * M_PI / L[0])* (2. * M_PI / L[0])  * exp(c_C * (1. - 1. / (1. - rsq))) ;
 
                         for (int dir = 0; dir < 3; dir++) {
                             const int dir2 = (dir + 1) % 3;
@@ -341,7 +341,6 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
 #endif
     m_profStop(prof, "Validation--Init-RHS");
 
-
 #ifdef DUMP_DBG
     char msg[512];
     // write the source term and the solution
@@ -351,17 +350,16 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
     flups_hdf5_dump(topo, msg, sol);
 #endif
 
-
     //-------------------------------------------------------------------------
     /** - Warm up */
     //-------------------------------------------------------------------------
     for (int is = 0; is < 5; is++) {
         MPI_Barrier(MPI_COMM_WORLD);
         m_profStart(prof, "Validation--Warm-up");
-        flups_solve(mysolver, field, rhs,STD);
+        flups_solve(mysolver, field, rhs, STD);
         m_profStop(prof, "Validation--Warm-up");
         MPI_Barrier(MPI_COMM_WORLD);
-    }	
+    }
 
     //-------------------------------------------------------------------------
     /** - solve the equations */
@@ -369,14 +367,13 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
     for (int is = 0; is < nSolve; is++) {
         MPI_Barrier(MPI_COMM_WORLD);
         m_profStart(prof, "Validation--Solve");
-        flups_solve(mysolver, field, rhs,STD);
+        flups_solve(mysolver, field, rhs, STD);
         m_profStop(prof, "Validation--Solve");
         MPI_Barrier(MPI_COMM_WORLD);
     }
 
     flups_profiler_disp(prof);
 
-
 #ifdef DUMP_DBG
     // write the source term and the solution
     sprintf(msg, "sol_%d%d%d%d%d%d_%dx%dx%d", mybc[0][0][0], mybc[0][1][0], mybc[1][0][0], mybc[1][1][0], mybc[2][0][0], mybc[2][1][0], nglob[0], nglob[1], nglob[2]);
@@ -396,7 +393,7 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
     std::memset(err2, 0, sizeof(double) * lda);
     std::memset(erri, 0, sizeof(double) * lda);
 
-    //determine the volume associated to a mesh
+    // determine the volume associated to a mesh
     double vol = 1.0;
     for (int id = 0; id < 3; id++) {
         if (mybc[id][0][0] != NONE && mybc[id][1][0] != NONE) {
@@ -430,12 +427,10 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
         err2[i] = sqrt(err2[i]);
     }
 
-    
-    
-    string folder = output_dir + "/data";
-    string ct_name = is_cell ? "CellCenter" : "NodeCenter"; 
+    string folder  = output_dir + "/data";
+    string ct_name = is_cell ? "CellCenter" : "NodeCenter";
     char   filename[PATH_MAX];
-    sprintf(filename, "%s/%s_%s_%d%d%d%d%d%d_typeGreen=%d.txt", folder.c_str(),  __func__, ct_name.c_str(),  mybc[0][0][0], mybc[0][1][0], mybc[1][0][0], mybc[1][1][0], mybc[2][0][0], mybc[2][1][0], typeGreen);
+    sprintf(filename, "%s/%s_%s_%d%d%d%d%d%d_typeGreen=%d.txt", folder.c_str(), __func__, ct_name.c_str(), mybc[0][0][0], mybc[0][1][0], mybc[1][0][0], mybc[1][1][0], mybc[2][0][0], mybc[2][1][0], typeGreen);
 
     if (rank == 0) {
         struct stat st_output = {0};
@@ -443,9 +438,9 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
         if (stat(output_dir.c_str(), &st_output) == -1) {
             mkdir(output_dir.c_str(), 0770);
         }
-        
+
         if (stat(folder.c_str(), &st_folder) == -1) {
-                mkdir(folder.c_str(), 0770);
+            mkdir(folder.c_str(), 0770);
         }
 
         FILE *myfile = fopen(filename, "a+");
@@ -479,6 +474,7 @@ void validation_3d(const DomainDescr myCase, const FLUPS_GreenType typeGreen, co
     flups_free(rhs);
     flups_free(field);
     flups_cleanup(mysolver);
+    flups_cleanup_fftw();
     flups_profiler_free(prof);
     flups_topo_free(topo);
 
diff --git a/src/Solver.cpp b/src/Solver.cpp
index 0a99d84..9d4518d 100644
--- a/src/Solver.cpp
+++ b/src/Solver.cpp
@@ -48,7 +48,7 @@ Solver::Solver(Topology *topo, BoundaryType *rhsbc[3][2], const double h[3], con
         if (fftw_alignment_of(&(data[i])) == 0) {
             // if we are above the minimum requirement, generate an error
             if (i > alignSize) {
-                FLUPS_CHECK(false, "The FLUPS alignement has to be a multiple integer of the FFTW alignement, please change the constant variable FLUPS_ALIGNMENT into file flups.h accordingly: FFTW=%d vs FLUPS=%d", fftwalignment_, FLUPS_ALIGNMENT);
+                FLUPS_CHECK(false, "The FLUPS alignement has to be a multiple integer of the FFTW alignement, please change the constant variable FLUPS_ALIGNMENT into file flups_interface.h accordingly: FFTW=%d vs FLUPS=%d", fftwalignment_, FLUPS_ALIGNMENT);
             }
             // else, just stop and advise the user to change
             break;
@@ -505,11 +505,7 @@ Solver::~Solver() {
     //    fftw_export_wisdom_to_filename(FLUPS_WISDOM_PATH);
     //#endif
 
-#if FLUPS_OPENMP    
-    fftw_cleanup_threads();
-#endif
 
-    fftw_cleanup();
     // m_profStopi(prof_, "Clean up");
     //-------------------------------------------------------------------------
     END_FUNC;
@@ -1038,7 +1034,6 @@ void Solver::cmptGreenFunction_(Topology *topo[3], double *green, FFTW_plan_dim
         FLUPS_INFO(">> using Green function type %d on 3 dir unbounded", typeGreen_);
         cmpt_Green_3dirunbounded(topo[0], hfact, symstart, green, typeGreen_, kernelLength);
     } else if ((n_unbounded) == 2) {
-        FLUPS_CHECK(!(typeGreen_ == LGF_2 && nbr_spectral == 1), "You cannot use LGF with one spectral direction!!");
         FLUPS_INFO(">> using Green function of type %d on 2 dir unbounded", typeGreen_);
         cmpt_Green_2dirunbounded(topo[0], hfact, kfact, koffset, symstart, green, typeGreen_, kernelLength);
     } else if ((n_unbounded) == 1) {
@@ -1091,7 +1086,10 @@ void Solver::cmptGreenFunction_(Topology *topo[3], double *green, FFTW_plan_dim
     /** - Complete the Green function in 2dirunbounded regularized case: we rewrite on the whole domain
      *      except the plane where k=0 in the spectral direction, as this was correctly computed. */
     // No need to scale this as that part of the Green function has a volfact = 1
-    if (ndim_ == 3 && nbr_spectral == 1 && (typeGreen_ == HEJ_2 || typeGreen_ == HEJ_4 || typeGreen_ == HEJ_6 || typeGreen_ == HEJ_8 || typeGreen_ == HEJ_10)) {
+    const bool is_hej = (typeGreen_ == HEJ_2 || typeGreen_ == HEJ_4 || typeGreen_ == HEJ_6 || typeGreen_ == HEJ_8 || typeGreen_ == HEJ_10);
+    const bool is_lgf = (typeGreen_ == LGF_2 || typeGreen_ == LGF_4 || typeGreen_ == LGF_6 || typeGreen_ == LGF_8);
+    const bool is_meh = (typeGreen_ == MEHR_4L || typeGreen_ == MEHR_4F || typeGreen_ == MEHR_6L || typeGreen_ == MEHR_6F);
+    if (ndim_ == 3 && nbr_spectral == 1 && (is_hej || is_lgf || is_meh)) {
         int istart_cstm[3] = {0, 0, 0};  // global
 
         for (int ip = 0; ip < 3; ip++) {
diff --git a/src/Solver.hpp b/src/Solver.hpp
index cd68ad8..cf5ba46 100644
--- a/src/Solver.hpp
+++ b/src/Solver.hpp
@@ -33,6 +33,9 @@
 #include "metis.h"
 #endif
 
+
+
+
 /**
  * @brief The Poisson solver
  *
diff --git a/src/flups.cpp b/src/flups.cpp
index 97abc7a..cf258d5 100644
--- a/src/flups.cpp
+++ b/src/flups.cpp
@@ -8,16 +8,16 @@
 
 #include <algorithm>  // std::remove
 
+#include "FFTW_plan_dim.hpp"
 #include "Solver.hpp"
 #include "Topology.hpp"
-#include "FFTW_plan_dim.hpp"
 #include "defines.hpp"
 #include "h3lpr/profiler.hpp"
 
 extern "C" {
 
 void* flups_malloc(size_t size) {
-    return m_calloc(size); 
+    return m_calloc(size);
 }
 
 void flups_free(void* data) {
@@ -111,9 +111,24 @@ Solver* flups_init_timed(Topology* t, BoundaryType* bc[3][2], const double h[3],
 
 // destroy the solver
 void flups_cleanup(Solver* s) {
+    flups_cleanup_all(s);
+}
+
+// Destroy the solver and cleanup fftw
+void flups_cleanup_all(Solver* s) {
     delete s;
+    flups_cleanup_fftw();
 }
 
+// cleanup all the structures related to fftw
+void flups_cleanup_fftw() {
+#if FLUPS_OPENMP
+    fftw_cleanup_threads();
+#endif
+
+    fftw_cleanup();
+};
+
 // setup the solver
 void flups_set_greenType(Solver* s, const GreenType type) {
     s->set_GreenType(type);
@@ -142,7 +157,7 @@ void flups_set_alpha(Solver* s, const double alpha) {
     s->set_alpha(alpha);
 }
 
-double* flups_get_innerBuffer(FLUPS_Solver* s){
+double* flups_get_innerBuffer(FLUPS_Solver* s) {
     return s->get_innerBuffer();
 }
 
@@ -154,7 +169,7 @@ Topology* flups_get_innerTopo_spectral(Solver* s) {
     return s->get_innerTopo_spectral();
 }
 
-void flups_skip_firstSwitchtopo(Solver* s){
+void flups_skip_firstSwitchtopo(Solver* s) {
     s->skip_firstSwitchtopo();
 }
 
diff --git a/src/flups.h b/src/flups.h
index f957e0d..b095a36 100644
--- a/src/flups.h
+++ b/src/flups.h
@@ -381,6 +381,26 @@ FLUPS_Solver* flups_init_timed(FLUPS_Topology* t, FLUPS_BoundaryType* bc[3][2],
  */
 void flups_cleanup(FLUPS_Solver* s);
 
+/**
+ * @brief Free the memomry of the solver and the data employed by fftw
+ *
+ * @warning this function free all the data of fftw. If you use multiple solver
+ *        you should not call this function
+ *
+ * @param s
+ */
+void flups_cleanup_all(FLUPS_Solver* s);
+
+/**
+ * @brief Free data employed by fftw
+ *
+ * @warning this function free all the data of fftw. If you use multiple solver, this function
+ * should be called after all the solver have freed
+ *
+ */
+void flups_cleanup_fftw(); 
+
+
 /**
  * @brief sets the type of the Green's function used by the solver
  *
diff --git a/src/green_functions.cpp b/src/green_functions.cpp
index 8d0d466..7080499 100644
--- a/src/green_functions.cpp
+++ b/src/green_functions.cpp
@@ -204,6 +204,11 @@ void cmpt_Green_2dirunbounded(const Topology *topo, const double hfact[3], const
     //Implementation note: if you want to do Helmolz, you need Hankel functions (3rd order Bessel) which are not implemented in stdC. Consider the use of boost lib.
     //notice that bessel_k has been introduced in c++17
 
+    // check for isotropy
+    const bool unb_dirs_are_isotropic = ((hfact[0] == 0.0) && (hfact[1] == hfact[2])) ||
+                                        ((hfact[1] == 0.0) && (hfact[0] == hfact[2])) ||
+                                        ((hfact[2] == 0.0) && (hfact[0] == hfact[1]));
+
     GreenKernel G;    // the Green kernel (general expression in the whole domain)
     GreenKernel Gk0;  // the Green kernel (particular expression in k=0)
     GreenKernel Gr0;  // the Green kernel (particular expression in r=0)
@@ -257,8 +262,8 @@ void cmpt_Green_2dirunbounded(const Topology *topo, const double hfact[3], const
             // caution: the value of G in k=r=0 is specified at the end of this routine
             break;
         case LGF_2:
-            FLUPS_CHECK(hfact[2] < 1.0e-14, "This LGF cannot be called in a 3D problem -> h[3] = %e",hfact[3]);
-            FLUPS_CHECK(hfact[0] == hfact[1], "the grid has to be isotropic to use the LGFs");
+            FLUPS_WARNING("LGF kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the LGFs");
             // read the LGF data and store it
             lgf_readfile_(LGF_2, 2, &GN, &Gdata);
             // associate the Green's function
@@ -267,40 +272,60 @@ void cmpt_Green_2dirunbounded(const Topology *topo, const double hfact[3], const
             Gr0 = &lgf_2_2unb0spe_;
             break;
         case LGF_4:
-            FLUPS_CHECK(hfact[2] < 1.0e-14, "This LGF cannot be called in a 3D problem -> h[3] = %e",hfact[3]);
-            FLUPS_CHECK(hfact[0] == hfact[1], "the grid has to be isotropic to use the LGFs");
+            FLUPS_WARNING("LGF kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the LGFs");
             lgf_readfile_(LGF_4, 2, &GN, &Gdata);
             G   = &zero_;
             Gk0 = &lgf_4_2unb0spe_;
             Gr0 = &lgf_4_2unb0spe_;
             break;
         case LGF_6:
-            FLUPS_CHECK(hfact[2] < 1.0e-14, "This LGF cannot be called in a 3D problem -> h[3] = %e",hfact[3]);
-            FLUPS_CHECK(hfact[0] == hfact[1], "the grid has to be isotropic to use the LGFs");
+            FLUPS_WARNING("LGF kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the LGFs");
             lgf_readfile_(LGF_6, 2, &GN, &Gdata);
             G   = &zero_;
             Gk0 = &lgf_6_2unb0spe_;
             Gr0 = &lgf_6_2unb0spe_;
             break;
         case LGF_8:
-            FLUPS_CHECK(hfact[2] < 1.0e-14, "This LGF cannot be called in a 3D problem -> h[3] = %e",hfact[3]);
-            FLUPS_CHECK(hfact[0] == hfact[1], "the grid has to be isotropic to use the LGFs");
+            FLUPS_WARNING("LGF kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the LGFs");
             lgf_readfile_(LGF_8, 2, &GN, &Gdata);
             G   = &zero_;
             Gk0 = &lgf_8_2unb0spe_;
             Gr0 = &lgf_8_2unb0spe_;
             break;
         case MEHR_4L:
-            FLUPS_CHECK(false, "MEHR4 kernel not available for 2D unbounded problems.");
+            FLUPS_WARNING("MEHR kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the MEHR kernels");
+            lgf_readfile_(MEHR_4L, 2, &GN, &Gdata);
+            G   = &zero_;
+            Gk0 = &mehr_4l6l_2unb0spe_;
+            Gr0 = &mehr_4l6l_2unb0spe_;
             break;
         case MEHR_6L:
-            FLUPS_CHECK(false, "MEHR6 kernel not available for 2D unbounded problems.");
+            FLUPS_WARNING("MEHR kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the MEHR kernels");
+            lgf_readfile_(MEHR_6L, 2, &GN, &Gdata);
+            G   = &zero_;
+            Gk0 = &mehr_4l6l_2unb0spe_;
+            Gr0 = &mehr_4l6l_2unb0spe_;
             break;
         case MEHR_4F:
-            FLUPS_CHECK(false, "MEHR4 kernel not available for 2D unbounded problems.");
+            FLUPS_WARNING("MEHR kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the MEHR kernels");
+            lgf_readfile_(MEHR_4F, 2, &GN, &Gdata);
+            G   = &zero_;
+            Gk0 = &mehr_4f_2unb0spe_;
+            Gr0 = &mehr_4f_2unb0spe_;
             break;
         case MEHR_6F:
-            FLUPS_CHECK(false, "MEHR6 kernel not available for 2D unbounded problems.");
+            FLUPS_WARNING("MEHR kernels in 2dirunbounded 1dirspectral entail an approximation in 3D.");
+            FLUPS_CHECK(unb_dirs_are_isotropic, "the grid has to be isotropic to use the MEHR kernels");
+            lgf_readfile_(MEHR_6F, 2, &GN, &Gdata);
+            G   = &zero_;
+            Gk0 = &mehr_6f_2unb0spe_;
+            Gr0 = &mehr_6f_2unb0spe_;
             break;
         default:
             FLUPS_CHECK(false, "Green Function type unknown.");
diff --git a/src/green_functions.hpp b/src/green_functions.hpp
index d8cb641..b5a9f70 100644
--- a/src/green_functions.hpp
+++ b/src/green_functions.hpp
@@ -61,8 +61,10 @@ static void lgf_readfile_(GreenType typeGreen, const int greendim, int* N, doubl
     } else if (greendim == 2) {
         (*N) = 32;
         datasize = (*N) * (*N);
-        if (MEHR_4L == typeGreen || MEHR_6L == typeGreen || MEHR_4F == typeGreen || MEHR_6F == typeGreen) {
-            FLUPS_CHECK(false, "MEHR kernels not implemented in 2D");
+        if (MEHR_4L == typeGreen || MEHR_6L == typeGreen) {
+            sprintf(lgfname, "%s/MEHR_4L6L_2d_%d.ker", path, (*N));
+        } else if (MEHR_4F == typeGreen || MEHR_6F == typeGreen) {
+            sprintf(lgfname, "%s/MEHR_%dF_2d_%d.ker", path, order, (*N));
         } else {
             sprintf(lgfname, "%s/LGF_%d_2d_%d.ker", path, order, (*N));
         }
diff --git a/src/green_kernels.hpp b/src/green_kernels.hpp
index 707e422..87f3e17 100644
--- a/src/green_kernels.hpp
+++ b/src/green_kernels.hpp
@@ -274,6 +274,60 @@ static inline double mehr_6f_3unb0spe_(const void* params,const double* data) {
     }
     return -green/(h);
 }
+static inline double mehr_4l6l_2unb0spe_(const void* params,const double* data) {
+    int    ix = (int)((double*)params)[4];
+    int    iy = (int)((double*)params)[5];
+    int    iz = (int)((double*)params)[6];
+    int    N  = (int)((double*)params)[7];
+    int    i1 = (ix == 0) ? iy : ix; // first nonzero index
+    int    i2 = (ix == 0) ? iz : (iy == 0) ? iz : iy; // second nonzero index
+
+    // if the point is close enough, it will be already precomputed
+    double green;
+    if (i1 < N && i2 < N) {
+        green = data[i1 + i2 * N];
+    } else {  // if not, we use the extrapolation
+        const double rho = sqrt(i1 * i1 + i2 * i2);
+        mehr_4l6l_2unb0spe_expansion(&green, i1, i2, rho);
+    }
+    return -green;
+}
+static inline double mehr_4f_2unb0spe_(const void* params,const double* data) {
+    int    ix = (int)((double*)params)[4];
+    int    iy = (int)((double*)params)[5];
+    int    iz = (int)((double*)params)[6];
+    int    N  = (int)((double*)params)[7];
+    int    i1 = (ix == 0) ? iy : ix; // first nonzero index
+    int    i2 = (ix == 0) ? iz : (iy == 0) ? iz : iy; // second nonzero index
+
+    // if the point is close enough, it will be already precomputed
+    double green;
+    if (i1 < N && i2 < N) {
+        green = data[i1 + i2 * N];
+    } else {  // if not, we use the extrapolation
+        const double rho = sqrt(i1 * i1 + i2 * i2);
+        mehr_4f_2unb0spe_expansion(&green, i1, i2, rho);
+    }
+    return -green;
+}
+static inline double mehr_6f_2unb0spe_(const void* params,const double* data) {
+    int    ix = (int)((double*)params)[4];
+    int    iy = (int)((double*)params)[5];
+    int    iz = (int)((double*)params)[6];
+    int    N  = (int)((double*)params)[7];
+    int    i1 = (ix == 0) ? iy : ix; // first nonzero index
+    int    i2 = (ix == 0) ? iz : (iy == 0) ? iz : iy; // second nonzero index
+
+    // if the point is close enough, it will be already precomputed
+    double green;
+    if (i1 < N && i2 < N) {
+        green = data[i1 + i2 * N];
+    } else {  // if not, we use the extrapolation
+        const double rho = sqrt(i1 * i1 + i2 * i2);
+        mehr_6f_2unb0spe_expansion(&green, i1, i2, rho);
+    }
+    return -green;
+}
 /**@} */
 
 
diff --git a/src/lgf_3unb0spe.hpp b/src/lgf_3unb0spe.hpp
index 3f0e9b0..c626d16 100644
--- a/src/lgf_3unb0spe.hpp
+++ b/src/lgf_3unb0spe.hpp
@@ -314,4 +314,62 @@ void mehr_6f_3unb0spe_expansion(double* G, const double n1, const double n2, con
   G[0] = (-0.018229166666666668 * n1_12 + -0.018229166666666668 * n2_12 + -0.018229166666666668 * n3_12 + -0.5625 * (n1 * n1) * n2_10 + 8.088541666666666 * n1_6 * n2_6 + 3.5 * n1_4 * n2_8 + 8.088541666666666 * n1_6 * n3_6 + -0.5625 * (n1 * n1) * n3_10 + 3.5 * n1_8 * n2_4 + 3.5 * n1_8 * n3_4 + -0.5625 * n1_10 * (n2 * n2) + 3.5 * n1_4 * n3_8 + -0.5625 * n1_10 * (n3 * n3) + 3.5 * n2_4 * n3_8 + 3.5 * n2_8 * n3_4 + 8.088541666666666 * n2_6 * n3_6 + -0.5625 * n2_10 * (n3 * n3) + -0.5625 * (n2 * n2) * n3_10 + -15.875 * n1_6 * n2_4 * (n3 * n3) + -9.953125 * n1_8 * (n2 * n2) * (n3 * n3) + -15.875 * n1_4 * (n2 * n2) * n3_6 + -15.875 * n1_6 * (n2 * n2) * n3_4 + -33.1640625 * n1_4 * n2_4 * n3_4 + -15.875 * n1_4 * n2_6 * (n3 * n3) + -9.953125 * (n1 * n1) * (n2 * n2) * n3_8 + -15.875 * (n1 * n1) * n2_6 * n3_4 + -15.875 * (n1 * n1) * n2_4 * n3_6 + -9.953125 * (n1 * n1) * n2_8 * (n3 * n3)) / (M_PI * pow(n, 19)) + (-0.67265625 * n1_16 + -0.67265625 * n2_16 + -0.67265625 * n3_16 + -27.74296875 * n1_10 * n2_6 + 7.79296875 * (n1 * n1) * n2_14 + -65.1328125 * n1_8 * n2_8 + 13.2890625 * n1_4 * n2_12 + 13.2890625 * n1_12 * n3_4 + -65.1328125 * n2_8 * n3_8 + -27.74296875 * n1_6 * n3_10 + -65.1328125 * n1_8 * n3_8 + 7.79296875 * n1_14 * (n3 * n3) + 13.2890625 * n1_12 * n2_4 + -27.74296875 * n2_10 * n3_6 + -27.74296875 * n2_6 * n3_10 + 7.79296875 * (n1 * n1) * n3_14 + 13.2890625 * n1_4 * n3_12 + 13.2890625 * n2_4 * n3_12 + 7.79296875 * (n2 * n2) * n3_14 + -27.74296875 * n1_6 * n2_10 + -27.74296875 * n1_10 * n3_6 + 7.79296875 * n1_14 * (n2 * n2) + 13.2890625 * n2_12 * n3_4 + 7.79296875 * n2_14 * (n3 * n3) + -98.109375 * (n1 * n1) * n2_8 * n3_6 + 13.9453125 * (n1 * n1) * n2_12 * (n3 * n3) + 13.9453125 * n1_12 * (n2 * n2) * (n3 * n3) + -27.64453125 * (n1 * n1) * n2_4 * n3_10 + 86.54296875 * n1_4 * n2_8 * n3_4 + -98.109375 * n1_8 * (n2 * n2) * n3_6 + -98.109375 * n1_8 * n2_6 * (n3 * n3) + 13.9453125 * (n1 * n1) * (n2 * n2) * n3_12 + -27.64453125 * n1_10 * n2_4 * (n3 * n3) + -27.64453125 * (n1 * n1) * n2_10 * n3_4 + 254.953125 * n1_6 * n2_4 * n3_6 + -98.109375 * n1_6 * n2_8 * (n3 * n3) + -98.109375 * n1_6 * (n2 * n2) * n3_8 + -27.64453125 * n1_4 * n2_10 * (n3 * n3) + 86.54296875 * n1_4 * n2_4 * n3_8 + 86.54296875 * n1_8 * n2_4 * n3_4 + 254.953125 * n1_6 * n2_6 * n3_4 + 254.953125 * n1_4 * n2_6 * n3_6 + -27.64453125 * n1_4 * (n2 * n2) * n3_10 + -27.64453125 * n1_10 * (n2 * n2) * n3_4 + -98.109375 * (n1 * n1) * n2_6 * n3_8) / (M_PI * pow(n, 25)) + 1 / (12.566370614359172 * n * 1);
 }
 
+void mehr_4l6l_2unb0spe_expansion(double* G, const double n1, const double n2, const double n) {
+  const double n1_4 = pow(n1, 4);
+  const double n1_6 = pow(n1, 6);
+  const double n1_8 = pow(n1, 8);
+  const double n1_10 = pow(n1, 10);
+  const double n1_12 = pow(n1, 12);
+  const double n1_14 = pow(n1, 14);
+  const double n1_16 = pow(n1, 16);
+  const double n2_4 = pow(n2, 4);
+  const double n2_6 = pow(n2, 6);
+  const double n2_8 = pow(n2, 8);
+  const double n2_10 = pow(n2, 10);
+  const double n2_12 = pow(n2, 12);
+  const double n2_14 = pow(n2, 14);
+  const double n2_16 = pow(n2, 16);
+  G[0] = (0.02976190476190476 * n1_12 + -0.7738095238095238 * n1_10 * (n2 * n2) + 0.44642857142857145 * n1_8 * n2_4 + 2.5 * n1_6 * n2_6 + 0.44642857142857145 * n1_4 * n2_8 + -0.7738095238095238 * (n1 * n1) * n2_10 + 0.02976190476190476 * n2_12) / (M_PI * pow(n, 18)) + (0.175 * n1_16 + -4.2 * n1_14 * (n2 * n2) + -6.3 * n1_12 * n2_4 + 15.4 * n1_10 * n2_6 + 34.65 * n1_8 * n2_8 + 15.4 * n1_6 * n2_10 + -6.3 * n1_4 * n2_12 + -4.2 * (n1 * n1) * n2_14 + 0.175 * n2_16) / (M_PI * pow(n, 24)) + (0.016666666666666666 * n1_8 + -0.06666666666666667 * n1_6 * (n2 * n2) + -0.16666666666666666 * n1_4 * n2_4 + -0.06666666666666667 * (n1 * n1) * n2_6 + 0.016666666666666666 * n2_8) / (M_PI * pow(n, 12)) + -0.15915494309189535 * (1.6169364357414509 + log(n));
+}
+
+void mehr_4f_2unb0spe_expansion(double* G, const double n1, const double n2, const double n) {
+  const double n1_4 = pow(n1, 4);
+  const double n1_6 = pow(n1, 6);
+  const double n1_8 = pow(n1, 8);
+  const double n1_10 = pow(n1, 10);
+  const double n1_12 = pow(n1, 12);
+  const double n1_14 = pow(n1, 14);
+  const double n1_16 = pow(n1, 16);
+  const double n1_18 = pow(n1, 18);
+  const double n1_20 = pow(n1, 20);
+  const double n2_4 = pow(n2, 4);
+  const double n2_6 = pow(n2, 6);
+  const double n2_8 = pow(n2, 8);
+  const double n2_10 = pow(n2, 10);
+  const double n2_12 = pow(n2, 12);
+  const double n2_14 = pow(n2, 14);
+  const double n2_16 = pow(n2, 16);
+  const double n2_18 = pow(n2, 18);
+  const double n2_20 = pow(n2, 20);
+  G[0] = (0.32083333333333336 * n1_16 + -7.7 * n1_14 * (n2 * n2) + -11.55 * n1_12 * n2_4 + 28.233333333333334 * n1_10 * n2_6 + 63.525 * n1_8 * n2_8 + 28.233333333333334 * n1_6 * n2_10 + -11.55 * n1_4 * n2_12 + -7.7 * (n1 * n1) * n2_14 + 0.32083333333333336 * n2_16) / (M_PI * pow(n, 24)) + (3.712121212121212 * n1_20 + -230.15151515151516 * n1_18 * (n2 * n2) + 879.7727272727273 * n1_16 * n2_4 + 2464.848484848485 * n1_14 * n2_6 + -1833.7878787878788 * n1_12 * n2_8 + -6370.0 * n1_10 * n2_10 + -1833.7878787878788 * n1_8 * n2_12 + 2464.848484848485 * n1_6 * n2_14 + 879.7727272727273 * n1_4 * n2_16 + -230.15151515151516 * (n1 * n1) * n2_18 + 3.712121212121212 * n2_20) / (M_PI * pow(n, 30)) + (0.02976190476190476 * n1_12 + -0.7738095238095238 * n1_10 * (n2 * n2) + 0.44642857142857145 * n1_8 * n2_4 + 2.5 * n1_6 * n2_6 + 0.44642857142857145 * n1_4 * n2_8 + -0.7738095238095238 * (n1 * n1) * n2_10 + 0.02976190476190476 * n2_12) / (M_PI * pow(n, 18)) + (0.058333333333333334 * n1_8 + -0.23333333333333334 * n1_6 * (n2 * n2) + -0.5833333333333334 * n1_4 * n2_4 + -0.23333333333333334 * (n1 * n1) * n2_6 + 0.058333333333333334 * n2_8) / (M_PI * pow(n, 12)) + -0.15915494309189535 * (1.6169364357414509 + log(n));
+}
+
+void mehr_6f_2unb0spe_expansion(double* G, const double n1, const double n2, const double n) {
+  const double n1_4 = pow(n1, 4);
+  const double n1_6 = pow(n1, 6);
+  const double n1_8 = pow(n1, 8);
+  const double n1_10 = pow(n1, 10);
+  const double n1_12 = pow(n1, 12);
+  const double n1_14 = pow(n1, 14);
+  const double n1_16 = pow(n1, 16);
+  const double n2_4 = pow(n2, 4);
+  const double n2_6 = pow(n2, 6);
+  const double n2_8 = pow(n2, 8);
+  const double n2_10 = pow(n2, 10);
+  const double n2_12 = pow(n2, 12);
+  const double n2_14 = pow(n2, 14);
+  const double n2_16 = pow(n2, 16);
+  G[0] = (0.02976190476190476 * n1_12 + -0.7738095238095238 * n1_10 * (n2 * n2) + 0.44642857142857145 * n1_8 * n2_4 + 2.5 * n1_6 * n2_6 + 0.44642857142857145 * n1_4 * n2_8 + -0.7738095238095238 * (n1 * n1) * n2_10 + 0.02976190476190476 * n2_12) / (M_PI * pow(n, 18)) + (-0.175 * n1_16 + 4.2 * n1_14 * (n2 * n2) + 6.3 * n1_12 * n2_4 + -15.4 * n1_10 * n2_6 + -34.65 * n1_8 * n2_8 + -15.4 * n1_6 * n2_10 + 6.3 * n1_4 * n2_12 + 4.2 * (n1 * n1) * n2_14 + -0.175 * n2_16) / (M_PI * pow(n, 24)) + -0.15915494309189535 * (1.6169364357414509 + log(n));
+}
+
 #endif
\ No newline at end of file