From bbabecf34f1c136ee1f83d226524a657696472e6 Mon Sep 17 00:00:00 2001
From: Charles-Antoine Leger <ca@leger.email>
Date: Mon, 7 Oct 2024 11:27:16 +0200
Subject: [PATCH] feat[poisson]: Support 3D mesh with CSR GPU + tests

---
 poisson/CMakeLists.txt             |   2 +
 poisson/CsrGpuBiliAssembly.cc      | 131 ++++++++++++++++++++++++-----
 poisson/FemModule.cc               |  11 ++-
 poisson/FemModule.h                |  81 +++++++++++++++++-
 poisson/Test.sphere.3D.csr-gpu.arc |  43 ++++++++++
 5 files changed, 244 insertions(+), 24 deletions(-)
 create mode 100644 poisson/Test.sphere.3D.csr-gpu.arc
diff --git a/poisson/CMakeLists.txt b/poisson/CMakeLists.txt
index 01d574f..53c3956 100644
--- a/poisson/CMakeLists.txt
+++ b/poisson/CMakeLists.txt
@@ -30,6 +30,7 @@ configure_file(Test.L-shape.3D.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
 configure_file(Test.sphere.3D.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
 configure_file(Test.sphere.3D.csr.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
 configure_file(Test.sphere.3D.csr.no-edge.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
+configure_file(Test.sphere.3D.csr-gpu.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
 configure_file(Test.direct-solver.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
 configure_file(Test.neumann.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
 configure_file(Test.porous.arc ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
@@ -94,6 +95,7 @@ endif()
 
 
 arcanefem_add_gpu_test(NAME [poisson]gpu COMMAND ./Poisson ARGS Test.petsc.arc)
+arcanefem_add_gpu_test(NAME [poisson]hypre_direct_3D_csr-gpu COMMAND ./Poisson ARGS Test.sphere.3D.csr-gpu.arc)
 if(FEMUTILS_HAS_SOLVER_BACKEND_HYPRE)
   arcanefem_add_gpu_test(NAME [poisson]hypre_direct_gpu COMMAND ./Poisson ARGS Test.hypre_direct.arc)
   arcanefem_add_gpu_test(NAME [poisson]hypre_direct_3D_gpu COMMAND ./Poisson ARGS Test.sphere.3D.arc)
diff --git a/poisson/CsrGpuBiliAssembly.cc b/poisson/CsrGpuBiliAssembly.cc
index 48dc341..8d3ec0e 100644
--- a/poisson/CsrGpuBiliAssembly.cc
+++ b/poisson/CsrGpuBiliAssembly.cc
@@ -14,33 +14,20 @@
 
 #include "FemModule.h"
 
-/**
- * @brief Initialization of the csr matrix. It only works for p=1 since there is
- * one node per Edge. Currently, there is no difference between buildMatrixCsr and this method.
- * 
- * 
- */
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
 void FemModule::
 _buildMatrixCsrGPU()
 {
-  //Initialization of the csr matrix;
-  //This formula only works in p=1
-
-  /*
-  //Create a connection between nodes through the faces
-  //Useless here because we only need this information once
-  IItemFamily* node_family = mesh()->nodeFamily();
-  NodeGroup nodes = node_family->allItems();
-  auto idx_cn = mesh()->indexedConnectivityMng()->findOrCreateConnectivity(node_family, node_family, "NodeToNeighbourFaceNodes");
-  auto* cn = idx_cn->connectivity();
-  ENUMERATE_NODE (node, allNodes()) {
-  }
-  */
+  auto node_dof(m_dofs_on_nodes.nodeDoFConnectivityView());
+
+  Int32 nnz = (options()->meshType == "TRIA3" ? nbFace() : nbEdge() * 2) + nbNode();
 
-  Int32 nnz = nbFace() * 2 + nbNode();
   m_csr_matrix.initialize(m_dof_family, nnz, nbNode());
-  auto node_dof(m_dofs_on_nodes.nodeDoFConnectivityView());
+
   //We iterate through the node, and we do not sort anymore : we assume the nodes ID are sorted, and we will iterate throught the column to avoid making < and > comparison
+  if (options()->meshType == "TRIA3") {
   ENUMERATE_NODE (inode, allNodes()) {
     Node node = *inode;
     Int32 node_dof_id = node_dof.dofId(node, 0);
@@ -55,9 +42,36 @@ _buildMatrixCsrGPU()
         m_csr_matrix.setCoordinates(diagonal_entry, node_dof.dofId(face.nodeId(0), 0));
     }
   }
+  }
+  else if (options()->meshType == "TETRA4") {
+  ENUMERATE_NODE (inode, allNodes()) {
+    Node node = *inode;
+    Int32 node_dof_id = node_dof.dofId(node, 0);
+    ItemLocalIdT<DoF> diagonal_entry(node_dof_id);
+
+    m_csr_matrix.setCoordinates(diagonal_entry, diagonal_entry);
+
+    for (Edge edge : node.edges()) {
+      if (edge.nodeId(0) == node.localId())
+        m_csr_matrix.setCoordinates(diagonal_entry, node_dof.dofId(edge.nodeId(1), 0));
+      else
+        m_csr_matrix.setCoordinates(diagonal_entry, node_dof.dofId(edge.nodeId(0), 0));
+    }
+  }
+  }
 }
 
 /*---------------------------------------------------------------------------*/
+/**
+ * @brief Assembles the bilinear operator matrix for the FEM linear system with
+ * the CSR sparse matrix format for TRIA3 elements.
+ *
+ * The method performs the following steps:
+ *   1. Builds the CSR matrix using _buildMatrixCsrGPU.
+ *   2. For each cell, computes element matrix using _computeElementMatrixTRIA3GPU.
+ *   3. Assembles global matrix by adding contributions from each cell's element 
+ *      matrix to the corresponding entries in the global matrix.
+ */
 /*---------------------------------------------------------------------------*/
 
 void FemModule::
@@ -136,4 +150,79 @@ _assembleCsrGPUBilinearOperatorTRIA3()
 }
 
 /*---------------------------------------------------------------------------*/
+/**
+ * @brief Assembles the bilinear operator matrix for the FEM linear system with
+ * the CSR sparse matrix format for TETRA4 elements.
+ *
+ * The method performs the following steps:
+ *   1. Builds the CSR matrix using _buildMatrixCsrGPU.
+ *   2. For each cell, computes element matrix using _computeElementMatrixTETRA4GPU.
+ *   3. Assembles global matrix by adding contributions from each cell's element 
+ *      matrix to the corresponding entries in the global matrix.
+ */
+/*---------------------------------------------------------------------------*/
+
+void FemModule::
+_assembleCsrGPUBilinearOperatorTETRA4()
+{
+  Timer::Action timer_gpu_bili(m_time_stats, "AssembleCsrGpuBilinearOperatorTetra4");
+
+  {
+    Timer::Action timer_gpu_build(m_time_stats, "CsrGpuBuildMatrix");
+    _buildMatrixCsrGPU();
+  }
+
+  RunQueue* queue = acceleratorMng()->defaultQueue();
+  auto command = makeCommand(queue);
+
+  auto node_dof(m_dofs_on_nodes.nodeDoFConnectivityView());
+  auto in_row_csr = ax::viewIn(command, m_csr_matrix.m_matrix_row);
+  Int32 row_csr_size = m_csr_matrix.m_matrix_row.dim1Size();
+  auto in_col_csr = ax::viewIn(command, m_csr_matrix.m_matrix_column);
+  Int32 col_csr_size = m_csr_matrix.m_matrix_column.dim1Size();
+  auto in_out_val_csr = ax::viewInOut(command, m_csr_matrix.m_matrix_value);
+  UnstructuredMeshConnectivityView m_connectivity_view;
+  auto in_node_coord = ax::viewIn(command, m_node_coord);
+  m_connectivity_view.setMesh(this->mesh());
+  auto cnc = m_connectivity_view.cellNode();
+  Arcane::ItemGenericInfoListView nodes_infos(this->mesh()->nodeFamily());
+
+  Timer::Action timer_add_compute(m_time_stats, "CsrGpuAddComputeLoop");
+
+  command << RUNCOMMAND_ENUMERATE(Cell, icell, allCells())
+  {
+
+    Real K_e[16] = { 0 };
+
+    _computeElementMatrixTETRA4GPU(icell, cnc, in_node_coord, K_e);
+
+    Int32 n1_index = 0;
+    for (NodeLocalId node1 : cnc.nodes(icell)) {
+      Int32 n2_index = 0;
+      for (NodeLocalId node2 : cnc.nodes(icell)) {
+        double v = K_e[n1_index * 4 + n2_index];
+
+        if (nodes_infos.isOwn(node1)) {
+
+          Int32 row = node_dof.dofId(node1, 0).localId();
+          Int32 col = node_dof.dofId(node2, 0).localId();
+          Int32 begin = in_row_csr[row];
+          Int32 end = (row == row_csr_size - 1) ? col_csr_size : in_row_csr[row + 1];
+
+          while (begin < end) {
+            if (in_col_csr[begin] == col) {
+              ax::doAtomic<ax::eAtomicOperation::Add>(in_out_val_csr(begin), v);
+              break;
+            }
+            begin++;
+          }
+        }
+        ++n2_index;
+      }
+      ++n1_index;
+    }
+  };
+}
+
 /*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
\ No newline at end of file
diff --git a/poisson/FemModule.cc b/poisson/FemModule.cc
index 99bce2e..5dd8b53 100644
--- a/poisson/FemModule.cc
+++ b/poisson/FemModule.cc
@@ -456,12 +456,19 @@ _doStationarySolve()
 #ifdef ARCANE_HAS_ACCELERATOR
   if (m_use_csr_gpu) {
     m_linear_system.clearValues();
-    _assembleCsrGPUBilinearOperatorTRIA3();
+    if (options()->meshType == "TRIA3")
+      _assembleCsrGPUBilinearOperatorTRIA3();
+    if (options()->meshType == "TETRA4")
+      _assembleCsrGPUBilinearOperatorTETRA4();
     if (m_cache_warming != 1) {
       m_time_stats->resetStats("AssembleCsrGpuBilinearOperatorTria3");
+      m_time_stats->resetStats("AssembleCsrGpuBilinearOperatorTetra4");
       for (cache_index = 1; cache_index < m_cache_warming; cache_index++) {
         m_linear_system.clearValues();
-        _assembleCsrGPUBilinearOperatorTRIA3();
+        if (options()->meshType == "TRIA3")
+          _assembleCsrGPUBilinearOperatorTRIA3();
+        if (options()->meshType == "TETRA4")
+          _assembleCsrGPUBilinearOperatorTETRA4();
       }
     }
      m_csr_matrix.translateToLinearSystem(m_linear_system);
diff --git a/poisson/FemModule.h b/poisson/FemModule.h
index b0f2198..2854081 100644
--- a/poisson/FemModule.h
+++ b/poisson/FemModule.h
@@ -223,6 +223,7 @@ class FemModule
 
   void _applyDirichletBoundaryConditionsGpu();
   void _assembleCsrGpuLinearOperator();
+
   static ARCCORE_HOST_DEVICE Int32
   _getValIndexCsrGpu(Int32 begin, Int32 end, DoFLocalId col, ax::NumArrayView<DataViewGetter<Int32>, MDDim1, DefaultLayout> csr_col);
 
@@ -255,6 +256,10 @@ class FemModule
   _computeElementMatrixTRIA3GPU(CellLocalId icell, IndexedCellNodeConnectivityView cnc,
                                 ax::VariableNodeReal3InView in_node_coord, Real K_e[9]);
 
+  static ARCCORE_HOST_DEVICE inline void
+  _computeElementMatrixTETRA4GPU(CellLocalId icell, IndexedCellNodeConnectivityView cnc,
+                                 ax::VariableNodeReal3InView in_node_coord, Real K_e[12]);
+
  private:
 
 
@@ -262,6 +267,7 @@ class FemModule
 
   void _buildMatrixCsrGPU();
   void _assembleCsrGPUBilinearOperatorTRIA3();
+  void _assembleCsrGPUBilinearOperatorTETRA4();
 
  private:
 
@@ -364,4 +370,77 @@ _computeElementMatrixTRIA3GPU(CellLocalId icell, IndexedCellNodeConnectivityView
   //return int_cdPi_dPj;
 }
 
-#endif
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
+ARCCORE_HOST_DEVICE inline void FemModule::
+_computeElementMatrixTETRA4GPU(CellLocalId icell, IndexedCellNodeConnectivityView cnc,
+                               ax::VariableNodeReal3InView in_node_coord, Real K_e[16])
+{
+  // Get coordinates of the triangle element  TETRA4
+  //------------------------------------------------
+  //                3 o
+  //                 /|\
+  //                / | \
+  //               /  |  \
+  //              /   o 2 \
+  //             / .    .  \
+  //            o-----------o
+  //            0           1
+  //------------------------------------------------
+  Real3 m0 = in_node_coord[cnc.nodeId(icell, 0)];
+  Real3 m1 = in_node_coord[cnc.nodeId(icell, 1)];
+  Real3 m2 = in_node_coord[cnc.nodeId(icell, 2)];
+  Real3 m3 = in_node_coord[cnc.nodeId(icell, 3)];
+
+  Real3 v0 = m1 - m0;
+  Real3 v1 = m2 - m0;
+  Real3 v2 = m3 - m0;
+
+  Real volume = std::abs(Arcane::math::dot(v0, Arcane::math::cross(v1, v2))) / 6.0;
+
+  // Compute gradients of shape functions
+  Real3 dPhi0 = Arcane::math::cross(m2 - m1, m1 - m3);
+  Real3 dPhi1 = Arcane::math::cross(m3 - m0, m0 - m2);
+  Real3 dPhi2 = Arcane::math::cross(m1 - m0, m0 - m3);
+  Real3 dPhi3 = Arcane::math::cross(m0 - m1, m1 - m2);
+
+  Real mul = 1.0 / (6.0 * volume);
+
+  // Construct the B-matrix
+  Real b_matrix[3][4];
+  b_matrix[0][0] = dPhi0.x;
+  b_matrix[1][0] = dPhi0.y;
+  b_matrix[2][0] = dPhi0.z;
+
+  b_matrix[0][1] = dPhi1.x;
+  b_matrix[1][1] = dPhi1.y;
+  b_matrix[2][1] = dPhi1.z;
+
+  b_matrix[0][2] = dPhi2.x;
+  b_matrix[1][2] = dPhi2.y;
+  b_matrix[2][2] = dPhi2.z;
+
+  b_matrix[0][3] = dPhi3.x;
+  b_matrix[1][3] = dPhi3.y;
+  b_matrix[2][3] = dPhi3.z;
+
+  for (Int32 i = 0; i < 3; ++i)
+    for (Int32 j = 0; j < 4; ++j)
+      b_matrix[i][j] *= mul;
+
+  // Compute the element matrix
+  for (Int32 i = 0; i < 4; ++i) {
+    for (Int32 j = i; j < 4; ++j) {
+      for (Int32 k = 0; k < 3; ++k)
+        K_e[i * 4 + j] += b_matrix[k][i] * b_matrix[k][j];
+      K_e[i * 4 + j] *= volume;
+      K_e[j * 4 + i] = K_e[i * 4 + j];
+    }
+  }
+}
+
+/*---------------------------------------------------------------------------*/
+/*---------------------------------------------------------------------------*/
+
+#endif
\ No newline at end of file
diff --git a/poisson/Test.sphere.3D.csr-gpu.arc b/poisson/Test.sphere.3D.csr-gpu.arc
new file mode 100644
index 0000000..5c45376
--- /dev/null
+++ b/poisson/Test.sphere.3D.csr-gpu.arc
@@ -0,0 +1,43 @@
+<?xml version="1.0"?>
+<case codename="Poisson" xml:lang="en" codeversion="1.0">
+  <arcane>
+    <title>Sphere 3D with CSR sparse matrix format Gpu compatible. The result of this test is compared with test_sphere_3D_results.txt</title>
+    <timeloop>PoissonLoop</timeloop>
+  </arcane>
+
+  <arcane-post-processing>
+   <output-period>1</output-period>
+   <format name="VtkHdfV2PostProcessor" />
+   <output>
+     <variable>U</variable>
+   </output>
+  </arcane-post-processing>
+
+  <meshes>
+    <mesh>
+      <filename>sphere_cut.msh</filename>
+    </mesh>
+  </meshes>
+
+  <fem>
+    <result-file>test_sphere_3D_results.txt</result-file>
+    <f>-0.01</f>
+    <mesh-type>TETRA4</mesh-type>
+    <enforce-Dirichlet-method>Penalty</enforce-Dirichlet-method>
+    <penalty>1.e31</penalty>
+    <dirichlet-boundary-condition>
+      <surface>sphere</surface>
+      <value>0.0</value>
+    </dirichlet-boundary-condition>
+    <dirichlet-boundary-condition>
+      <surface>Cut</surface>
+      <value>10.0</value>
+    </dirichlet-boundary-condition>
+    <linear-system name="HypreLinearSystem">
+      <rtol>0.</rtol>
+      <atol>1e-5</atol>
+      <amg-threshold>0.55</amg-threshold>
+    </linear-system>
+    <csr-gpu>true</csr-gpu>
+  </fem>
+</case>