From 5ca60f954a0694d053bfabd0b6babb2459da6921 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Thu, 3 Oct 2024 12:03:08 -0600 Subject: [PATCH 01/12] remove gpus options from python argument list; only do sanity check ngpus_per_node --- CIME/XML/env_mach_pes.py | 23 +++++++--- CIME/case/case.py | 42 ++++--------------- .../config/xml_schemas/config_machines.xsd | 10 ----- .../config/xml_schemas/env_mach_specific.xsd | 4 -- CIME/scripts/create_newcase.py | 28 ------------- CIME/test_scheduler.py | 12 ------ CIME/tests/test_unit_case.py | 6 --- 7 files changed, 26 insertions(+), 99 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index 76c6588901b..61402aa9686 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -44,6 +44,7 @@ def get_value( max_mpitasks_per_node=None, max_cputasks_per_gpu_node=None, ngpus_per_node=None, + oversubscribe_gpu=False ): # pylint: disable=arguments-differ # Special variable NINST_MAX is used to determine the number of # drivers in multi-driver mode. @@ -177,17 +178,27 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): ) if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"): if self.get_value("NGPUS_PER_NODE") > 0: - tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + if self.get_value("OVERSUBSCRIBE_GPU"): + tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + else: + tasks_per_node = self.get_value("NGPUS_PER_NODE") else: tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") else: ngpus_per_node = self.get_value("NGPUS_PER_NODE") if ngpus_per_node and ngpus_per_node > 0: - tasks_per_node = min( - self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, - self.get_value("MAX_CPUTASKS_PER_GPU_NODE"), - total_tasks, - ) + if self.get_value("OVERSUBSCRIBE_GPU"): + tasks_per_node = min( + self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, + self.get_value("MAX_CPUTASKS_PER_GPU_NODE"), + total_tasks, + ) + else: + tasks_per_node = min( + self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, + self.get_value("NGPUS_PER_NODE"), + total_tasks, + ) else: tasks_per_node = min( self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, diff --git a/CIME/case/case.py b/CIME/case/case.py index 6f9082b0aa4..82d5baacdc3 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -1301,9 +1301,6 @@ def configure( non_local=False, extra_machines_dir=None, case_group=None, - ngpus_per_node=0, - gpu_type=None, - gpu_offload=None, ): expect( check_name(compset_name, additional_chars="."), @@ -1563,12 +1560,15 @@ def configure( # ---------------------------------------------------------------------------------------------------------- # Sanity check for a GPU run: - # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUS - # 2. if ngpus_per_node argument is larger than the value of MAX_GPUS_PER_NODE, the NGPUS_PER_NODE - # XML variable in the env_mach_pes.xml file would be set to MAX_GPUS_PER_NODE automatically. - # 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically. + # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs + # 2. If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than + # the value of MAX_GPUS_PER_NODE, set it to MAX_GPUS_PER_NODE automatically. + # 3. If the NGPUS_PER_NODE XML variable is equal to 0, it will be updated to 1 automatically. 
# ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") + gpu_type = self.get_value("GPU_TYPE") + gpu_offload = self.get_value("GPU_OFFLOAD") + ngpus_per_node = self.get_value("NGPUS_PER_NODE") if gpu_type and str(gpu_type).lower() != "none": expect( max_gpus_per_node, @@ -1579,20 +1579,8 @@ def configure( "Both gpu-type and gpu-offload must be defined if either is defined", ) expect( - compiler in ["nvhpc", "cray"], - f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ", - ) - valid_gpu_type = self.get_value("GPU_TYPE").split(",") - valid_gpu_type.remove("none") - expect( - gpu_type in valid_gpu_type, - f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}", - ) - valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",") - valid_gpu_offload.remove("none") - expect( - gpu_offload in valid_gpu_offload, - f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}", + compiler in ["gnu", "nvhpc", "cray"], + f"Only gnu, nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ", ) self.gpu_enabled = True if ngpus_per_node >= 0: @@ -1613,12 +1601,6 @@ def configure( f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;", ) - # Set these two GPU XML variables here to overwrite the default values - # Only set them for "cesm" model - if self._cime_model == "cesm": - self.set_value("GPU_TYPE", str(gpu_type).lower()) - self.set_value("GPU_OFFLOAD", str(gpu_offload).lower()) - self.initialize_derived_attributes() # -------------------------------------------- @@ -2440,9 +2422,6 @@ def create( non_local=False, extra_machines_dir=None, case_group=None, - ngpus_per_node=0, - gpu_type=None, - gpu_offload=None, ): try: # Set values for env_case.xml @@ -2515,9 +2494,6 @@ def create( non_local=non_local, extra_machines_dir=extra_machines_dir, case_group=case_group, - ngpus_per_node=ngpus_per_node, - gpu_type=gpu_type, - gpu_offload=gpu_offload, ) self.create_caseroot() diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index b025c4039e0..70f6f155d1e 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -6,8 +6,6 @@ - - @@ -59,8 +57,6 @@ - - @@ -175,10 +171,6 @@ - - - - @@ -265,8 +257,6 @@ - - diff --git a/CIME/data/config/xml_schemas/env_mach_specific.xsd b/CIME/data/config/xml_schemas/env_mach_specific.xsd index 3c7a3a0d679..529598077d0 100644 --- a/CIME/data/config/xml_schemas/env_mach_specific.xsd +++ b/CIME/data/config/xml_schemas/env_mach_specific.xsd @@ -9,8 +9,6 @@ - - @@ -104,8 +102,6 @@ - - diff --git a/CIME/scripts/create_newcase.py b/CIME/scripts/create_newcase.py index 1e7b33ea315..879a5d167c8 100755 --- a/CIME/scripts/create_newcase.py +++ b/CIME/scripts/create_newcase.py @@ -264,25 +264,6 @@ def parse_command_line(args, cimeroot, description): parser.add_argument("--case-group", help="Add this case to a case group") - parser.add_argument( - "--ngpus-per-node", - default=0, - type=int, - help="Specify number of GPUs used for simulation. 
", - ) - - parser.add_argument( - "--gpu-type", - default=None, - help="Specify type of GPU hardware - currently supported are v100, a100, mi250", - ) - - parser.add_argument( - "--gpu-offload", - default=None, - help="Specify gpu offload method - currently supported are openacc, openmp, combined", - ) - args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser) if args.srcroot is not None: @@ -358,9 +339,6 @@ def parse_command_line(args, cimeroot, description): args.non_local, args.extra_machines_dir, args.case_group, - args.ngpus_per_node, - args.gpu_type, - args.gpu_offload, ) @@ -397,9 +375,6 @@ def _main_func(description=None): non_local, extra_machines_dir, case_group, - ngpus_per_node, - gpu_type, - gpu_offload, ) = parse_command_line(sys.argv, cimeroot, description) if script_root is None: @@ -464,9 +439,6 @@ def _main_func(description=None): non_local=non_local, extra_machines_dir=extra_machines_dir, case_group=case_group, - ngpus_per_node=ngpus_per_node, - gpu_type=gpu_type, - gpu_offload=gpu_offload, ) # Called after create since casedir does not exist yet diff --git a/CIME/test_scheduler.py b/CIME/test_scheduler.py index da99dacb69b..179402c7501 100644 --- a/CIME/test_scheduler.py +++ b/CIME/test_scheduler.py @@ -670,18 +670,6 @@ def _create_newcase_phase(self, test): elif case_opt.startswith("P"): pesize = case_opt[1:] create_newcase_cmd += " --pecount {}".format(pesize) - elif case_opt.startswith("G"): - if "-" in case_opt: - ngpus_per_node, gpu_type, gpu_offload = case_opt[1:].split("-") - else: - error = "GPU test argument format is ngpus_per_node-gpu_type-gpu_offload" - self._log_output(test, error) - return False, error - create_newcase_cmd += ( - " --ngpus-per-node {} --gpu-type {} --gpu-offload {}".format( - ngpus_per_node, gpu_type, gpu_offload - ) - ) elif case_opt.startswith("V"): driver = case_opt[1:] diff --git a/CIME/tests/test_unit_case.py b/CIME/tests/test_unit_case.py index abc2acff8ee..520e2d9e4bb 100755 --- a/CIME/tests/test_unit_case.py +++ b/CIME/tests/test_unit_case.py @@ -253,9 +253,6 @@ def test_copy( non_local=False, extra_machines_dir=None, case_group=None, - ngpus_per_node=0, - gpu_type=None, - gpu_offload=None, ) create_caseroot.assert_called() apply_user_mods.assert_called() @@ -330,9 +327,6 @@ def test_create( non_local=False, extra_machines_dir=None, case_group=None, - ngpus_per_node=0, - gpu_type=None, - gpu_offload=None, ) create_caseroot.assert_called() apply_user_mods.assert_called() From ed43349cb2ff5a73826668b48ad4c81da69e9a8a Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Thu, 3 Oct 2024 13:34:48 -0600 Subject: [PATCH 02/12] fix XML scheme issue for machine attribute --- CIME/data/config/xml_schemas/entry_id_base.xsd | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/CIME/data/config/xml_schemas/entry_id_base.xsd b/CIME/data/config/xml_schemas/entry_id_base.xsd index b0a060997a9..2aae23762f6 100644 --- a/CIME/data/config/xml_schemas/entry_id_base.xsd +++ b/CIME/data/config/xml_schemas/entry_id_base.xsd @@ -13,7 +13,6 @@ - @@ -28,6 +27,16 @@ + + + + + + + + + + From 4fce782f110ffce3c0ab694f636aa785dab6ac61 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Thu, 3 Oct 2024 22:01:12 -0600 Subject: [PATCH 03/12] update gpu option checks --- CIME/build.py | 15 +++------------ CIME/case/case.py | 12 +++++++----- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/CIME/build.py b/CIME/build.py index 2d6c6414ffa..01f2a702b08 100644 --- a/CIME/build.py +++ b/CIME/build.py @@ -247,18 +247,9 @@ def 
get_standard_cmake_args(case, sharedpath): ) # check settings for GPU gpu_type = case.get_value("GPU_TYPE") - gpu_offload = case.get_value("GPU_OFFLOAD") - if gpu_type != "none": - expect( - gpu_offload != "none", - "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", - ) - cmake_args += f" -DGPU_TYPE={gpu_type} -DGPU_OFFLOAD={gpu_offload}" - else: - expect( - gpu_offload == "none", - "Both GPU_TYPE and GPU_OFFLOAD must be defined if either is", - ) + openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD") + openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD") + cmake_args += f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload} -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload} " ocn_model = case.get_value("COMP_OCN") atm_dycore = case.get_value("CAM_DYCORE") diff --git a/CIME/case/case.py b/CIME/case/case.py index 82d5baacdc3..537ba4e18e8 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -1567,16 +1567,18 @@ def configure( # ---------------------------------------------------------------------------------------------------------- max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") gpu_type = self.get_value("GPU_TYPE") - gpu_offload = self.get_value("GPU_OFFLOAD") + openacc_gpu_offload = self.get_value("OPENACC_GPU_OFFLOAD") + openmp_gpu_offload = self.get_value("OPENMP_GPU_OFFLOAD") + gpu_offload = (openacc_gpu_offload or openmp_gpu_offload) ngpus_per_node = self.get_value("NGPUS_PER_NODE") if gpu_type and str(gpu_type).lower() != "none": expect( max_gpus_per_node, - f"GPUS are not defined for machine={machine_name} and compiler={compiler}", + f"MAX_GPUS_PER_NODE is not defined for machine={machine_name} and compiler={compiler}", ) expect( gpu_offload, - "Both gpu-type and gpu-offload must be defined if either is defined", + "GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled", ) expect( compiler in ["gnu", "nvhpc", "cray"], @@ -1590,10 +1592,10 @@ def configure( if ngpus_per_node <= max_gpus_per_node else max_gpus_per_node, ) - elif gpu_offload and str(gpu_offload).lower() != "none": + elif gpu_offload: expect( False, - "Both gpu-type and gpu-offload must be defined if either is defined", + "GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled", ) elif ngpus_per_node != 0: expect( From ace9c2a97529a7db36c5e517b4d5f3f23fdcaf03 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Thu, 3 Oct 2024 22:36:13 -0600 Subject: [PATCH 04/12] update the GPU job resources for a SCREAM GPU run --- CIME/XML/env_mach_pes.py | 4 +++- CIME/case/case.py | 9 +++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index 61402aa9686..1e244402fcf 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -182,6 +182,7 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") else: tasks_per_node = self.get_value("NGPUS_PER_NODE") + self.set_value("MAX_CPUTASKS_PER_GPU_NODE", tasks_per_node) else: tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") else: @@ -198,7 +199,8 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, self.get_value("NGPUS_PER_NODE"), total_tasks, - ) + ) + self.set_value("MAX_CPUTASKS_PER_GPU_NODE", tasks_per_node) else: tasks_per_node = min( self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, diff --git a/CIME/case/case.py b/CIME/case/case.py index 537ba4e18e8..a3688b7c171 100644 --- 
a/CIME/case/case.py +++ b/CIME/case/case.py @@ -1569,9 +1569,10 @@ def configure( gpu_type = self.get_value("GPU_TYPE") openacc_gpu_offload = self.get_value("OPENACC_GPU_OFFLOAD") openmp_gpu_offload = self.get_value("OPENMP_GPU_OFFLOAD") - gpu_offload = (openacc_gpu_offload or openmp_gpu_offload) + kokkos_gpu_offload = self.get_value("KOKKOS_GPU_OFFLOAD") + gpu_offload = (openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload) ngpus_per_node = self.get_value("NGPUS_PER_NODE") - if gpu_type and str(gpu_type).lower() != "none": + if str(gpu_type).lower() != "none": expect( max_gpus_per_node, f"MAX_GPUS_PER_NODE is not defined for machine={machine_name} and compiler={compiler}", @@ -1580,10 +1581,6 @@ def configure( gpu_offload, "GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled", ) - expect( - compiler in ["gnu", "nvhpc", "cray"], - f"Only gnu, nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ", - ) self.gpu_enabled = True if ngpus_per_node >= 0: self.set_value( From 8157c8b5883d42222b27fad557c7fe771d6caf25 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Fri, 4 Oct 2024 00:14:12 -0600 Subject: [PATCH 05/12] check the valid GPU setups at the correct place --- CIME/case/case.py | 42 ----------------------------------------- CIME/case/case_setup.py | 32 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 42 deletions(-) diff --git a/CIME/case/case.py b/CIME/case/case.py index a3688b7c171..455a6877d6b 100644 --- a/CIME/case/case.py +++ b/CIME/case/case.py @@ -1558,48 +1558,6 @@ def configure( if test: self.set_value("TEST", True) - # ---------------------------------------------------------------------------------------------------------- - # Sanity check for a GPU run: - # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs - # 2. If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than - # the value of MAX_GPUS_PER_NODE, set it to MAX_GPUS_PER_NODE automatically. - # 3. If the NGPUS_PER_NODE XML variable is equal to 0, it will be updated to 1 automatically. 
- # ---------------------------------------------------------------------------------------------------------- - max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE") - gpu_type = self.get_value("GPU_TYPE") - openacc_gpu_offload = self.get_value("OPENACC_GPU_OFFLOAD") - openmp_gpu_offload = self.get_value("OPENMP_GPU_OFFLOAD") - kokkos_gpu_offload = self.get_value("KOKKOS_GPU_OFFLOAD") - gpu_offload = (openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload) - ngpus_per_node = self.get_value("NGPUS_PER_NODE") - if str(gpu_type).lower() != "none": - expect( - max_gpus_per_node, - f"MAX_GPUS_PER_NODE is not defined for machine={machine_name} and compiler={compiler}", - ) - expect( - gpu_offload, - "GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled", - ) - self.gpu_enabled = True - if ngpus_per_node >= 0: - self.set_value( - "NGPUS_PER_NODE", - max(1, ngpus_per_node) - if ngpus_per_node <= max_gpus_per_node - else max_gpus_per_node, - ) - elif gpu_offload: - expect( - False, - "GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled", - ) - elif ngpus_per_node != 0: - expect( - False, - f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;", - ) - self.initialize_derived_attributes() # -------------------------------------------- diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py index 1679daf8994..58c5798abbf 100644 --- a/CIME/case/case_setup.py +++ b/CIME/case/case_setup.py @@ -389,6 +389,38 @@ def _case_setup_impl( + case.iotasks, ) + # ---------------------------------------------------------------------------------------------------------- + # Sanity check for a GPU run: + # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs + # 2. If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than + # the value of MAX_GPUS_PER_NODE, set it to MAX_GPUS_PER_NODE automatically. + # 3. If the NGPUS_PER_NODE XML variable is equal to 0, it will be updated to 1 automatically. + # ---------------------------------------------------------------------------------------------------------- + max_gpus_per_node = case.get_value("MAX_GPUS_PER_NODE") + gpu_type = case.get_value("GPU_TYPE") + openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD") + openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD") + kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD") + gpu_offload = (openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload) + ngpus_per_node = case.get_value("NGPUS_PER_NODE") + if str(gpu_type).lower() != "none": + if max_gpus_per_node <= 0: + raise RuntimeError(f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run") + if not gpu_offload: + raise RuntimeError(f"GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled") + case.gpu_enabled = True + if ngpus_per_node >= 0: + case.set_value( + "NGPUS_PER_NODE", + max(1, ngpus_per_node) + if ngpus_per_node <= max_gpus_per_node + else max_gpus_per_node, + ) + elif gpu_offload: + raise RuntimeError(f"GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled") + elif ngpus_per_node != 0: + raise RuntimeError(f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;") + # May need to select new batch settings if pelayout changed (e.g. 
problem is now too big for prev-selected queue) env_batch = case.get_env("batch") env_batch.set_job_defaults([(case.get_primary_job(), {})], case) From 3864cf7f65f0284e630977bf190d69a91be455c9 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Fri, 4 Oct 2024 09:50:34 -0600 Subject: [PATCH 06/12] fix missing option and comma --- CIME/XML/env_mach_pes.py | 2 +- CIME/build.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index 1e244402fcf..ffa0d217937 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -44,7 +44,7 @@ def get_value( max_mpitasks_per_node=None, max_cputasks_per_gpu_node=None, ngpus_per_node=None, - oversubscribe_gpu=False + oversubscribe_gpu=False, ): # pylint: disable=arguments-differ # Special variable NINST_MAX is used to determine the number of # drivers in multi-driver mode. diff --git a/CIME/build.py b/CIME/build.py index 01f2a702b08..b84b4bf5f7f 100644 --- a/CIME/build.py +++ b/CIME/build.py @@ -249,7 +249,8 @@ def get_standard_cmake_args(case, sharedpath): gpu_type = case.get_value("GPU_TYPE") openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD") openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD") - cmake_args += f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload} -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload} " + kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD") + cmake_args += f" -DGPU_TYPE={gpu_type} -DOPENACC_GPU_OFFLOAD={openacc_gpu_offload} -DOPENMP_GPU_OFFLOAD={openmp_gpu_offload} -DKOKKOS_GPU_OFFLOAD={kokkos_gpu_offload} " ocn_model = case.get_value("COMP_OCN") atm_dycore = case.get_value("CAM_DYCORE") From 0680eaf00495bf5aa3efff2729490237a740a754 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Fri, 4 Oct 2024 10:14:19 -0600 Subject: [PATCH 07/12] more bug fixes and allow some GPU varibles undefined --- CIME/XML/env_mach_pes.py | 3 ++- CIME/case/case_setup.py | 4 ++-- CIME/data/config/xml_schemas/config_machines.xsd | 2 ++ CIME/data/config/xml_schemas/env_mach_specific.xsd | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index ffa0d217937..14cb803da42 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -177,7 +177,8 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): "totaltasks > 0 expected, totaltasks = {}".format(total_tasks), ) if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"): - if self.get_value("NGPUS_PER_NODE") > 0: + ngpus_per_node = self.get_value("NGPUS_PER_NODE") + if ngpus_per_node and ngpus_per_node > 0: if self.get_value("OVERSUBSCRIBE_GPU"): tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") else: diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py index 58c5798abbf..6d1a4ad250e 100644 --- a/CIME/case/case_setup.py +++ b/CIME/case/case_setup.py @@ -403,7 +403,7 @@ def _case_setup_impl( kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD") gpu_offload = (openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload) ngpus_per_node = case.get_value("NGPUS_PER_NODE") - if str(gpu_type).lower() != "none": + if gpu_type and str(gpu_type).lower() != "none": if max_gpus_per_node <= 0: raise RuntimeError(f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run") if not gpu_offload: @@ -418,7 +418,7 @@ def _case_setup_impl( ) elif gpu_offload: raise RuntimeError(f"GPU_TYPE is not defined but at least one GPU OFFLOAD 
option is enabled") - elif ngpus_per_node != 0: + elif ngpus_per_node and ngpus_per_node != 0: raise RuntimeError(f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;") # May need to select new batch settings if pelayout changed (e.g. problem is now too big for prev-selected queue) diff --git a/CIME/data/config/xml_schemas/config_machines.xsd b/CIME/data/config/xml_schemas/config_machines.xsd index 70f6f155d1e..6be3efd952e 100644 --- a/CIME/data/config/xml_schemas/config_machines.xsd +++ b/CIME/data/config/xml_schemas/config_machines.xsd @@ -6,6 +6,7 @@ + @@ -257,6 +258,7 @@ + diff --git a/CIME/data/config/xml_schemas/env_mach_specific.xsd b/CIME/data/config/xml_schemas/env_mach_specific.xsd index 529598077d0..77020e8e0f1 100644 --- a/CIME/data/config/xml_schemas/env_mach_specific.xsd +++ b/CIME/data/config/xml_schemas/env_mach_specific.xsd @@ -9,6 +9,7 @@ + @@ -102,6 +103,7 @@ + From 130c75470a666f7519c65881960c82b983bedc7d Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Fri, 4 Oct 2024 20:42:16 -0600 Subject: [PATCH 08/12] bug fix for calculating job resource --- CIME/XML/env_mach_pes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index 14cb803da42..6962dd282a1 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -183,7 +183,6 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): tasks_per_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") else: tasks_per_node = self.get_value("NGPUS_PER_NODE") - self.set_value("MAX_CPUTASKS_PER_GPU_NODE", tasks_per_node) else: tasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") else: @@ -201,7 +200,6 @@ def get_tasks_per_node(self, total_tasks, max_thread_count): self.get_value("NGPUS_PER_NODE"), total_tasks, ) - self.set_value("MAX_CPUTASKS_PER_GPU_NODE", tasks_per_node) else: tasks_per_node = min( self.get_value("MAX_TASKS_PER_NODE") // max_thread_count, @@ -218,7 +216,12 @@ def get_total_nodes(self, total_tasks, max_thread_count): if self._comp_interface == "nuopc" and self.get_value("ESMF_AWARE_THREADING"): max_thread_count = 1 tasks_per_node = self.get_tasks_per_node(total_tasks, max_thread_count) - num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node)) + if self.get_value("OVERSUBSCRIBE_GPU"): + num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node)) + else: + ngpus_per_node = self.get_value("NGPUS_PER_NODE") + if ngpus_per_node and ngpus_per_node > 0: + num_nodes = int(math.ceil(float(total_tasks) / ngpus_per_node)) return num_nodes, self.get_spare_nodes(num_nodes) def get_spare_nodes(self, num_nodes): From 0b307d8a78ba40d2767c73837185dff7c6c0c946 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Fri, 4 Oct 2024 20:51:17 -0600 Subject: [PATCH 09/12] more bug fix --- CIME/XML/env_mach_pes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index 6962dd282a1..8e435329266 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -222,6 +222,8 @@ def get_total_nodes(self, total_tasks, max_thread_count): ngpus_per_node = self.get_value("NGPUS_PER_NODE") if ngpus_per_node and ngpus_per_node > 0: num_nodes = int(math.ceil(float(total_tasks) / ngpus_per_node)) + else: + num_nodes = int(math.ceil(float(total_tasks) / tasks_per_node)) return num_nodes, self.get_spare_nodes(num_nodes) def get_spare_nodes(self, num_nodes): From 0d7ce8bb9a19f5e93767dfa9f2c9abf37b706d9c Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Sat, 5 
Oct 2024 19:37:27 -0600 Subject: [PATCH 10/12] remove unused arguments and address pylint error --- CIME/XML/env_mach_pes.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/CIME/XML/env_mach_pes.py b/CIME/XML/env_mach_pes.py index 8e435329266..3dba2c95ecb 100644 --- a/CIME/XML/env_mach_pes.py +++ b/CIME/XML/env_mach_pes.py @@ -41,10 +41,6 @@ def get_value( attribute=None, resolved=True, subgroup=None, - max_mpitasks_per_node=None, - max_cputasks_per_gpu_node=None, - ngpus_per_node=None, - oversubscribe_gpu=False, ): # pylint: disable=arguments-differ # Special variable NINST_MAX is used to determine the number of # drivers in multi-driver mode. @@ -59,12 +55,9 @@ def get_value( value = EnvBase.get_value(self, vid, attribute, resolved, subgroup) if "NTASKS" in vid or "ROOTPE" in vid: - if max_mpitasks_per_node is None: - max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") - if max_cputasks_per_gpu_node is None: - max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") - if ngpus_per_node is None: - ngpus_per_node = self.get_value("NGPUS_PER_NODE") + max_mpitasks_per_node = self.get_value("MAX_MPITASKS_PER_NODE") + max_cputasks_per_gpu_node = self.get_value("MAX_CPUTASKS_PER_GPU_NODE") + ngpus_per_node = self.get_value("NGPUS_PER_NODE") if (ngpus_per_node and value) and value < 0: value = -1 * value * max_cputasks_per_gpu_node elif value and value < 0: From 40d8c5d5873c3e1a22250df1e7c67f202c887550 Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Sun, 6 Oct 2024 08:20:06 -0600 Subject: [PATCH 11/12] more pylint error fix --- CIME/case/case_setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py index 6d1a4ad250e..c6a35779832 100644 --- a/CIME/case/case_setup.py +++ b/CIME/case/case_setup.py @@ -407,7 +407,7 @@ def _case_setup_impl( if max_gpus_per_node <= 0: raise RuntimeError(f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run") if not gpu_offload: - raise RuntimeError(f"GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled") + raise RuntimeError("GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled") case.gpu_enabled = True if ngpus_per_node >= 0: case.set_value( @@ -417,7 +417,7 @@ def _case_setup_impl( else max_gpus_per_node, ) elif gpu_offload: - raise RuntimeError(f"GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled") + raise RuntimeError("GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled") elif ngpus_per_node and ngpus_per_node != 0: raise RuntimeError(f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;") From 246eea7434837147de5bb66aedc4f7c5d0eeb4ad Mon Sep 17 00:00:00 2001 From: Jian Sun Date: Mon, 7 Oct 2024 10:22:39 -0600 Subject: [PATCH 12/12] try to fix the pre-commit error --- CIME/case/case_setup.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/CIME/case/case_setup.py b/CIME/case/case_setup.py index c6a35779832..6ca33d6cabc 100644 --- a/CIME/case/case_setup.py +++ b/CIME/case/case_setup.py @@ -392,7 +392,7 @@ def _case_setup_impl( # ---------------------------------------------------------------------------------------------------------- # Sanity check for a GPU run: # 1. GPU_TYPE and GPU_OFFLOAD must both be defined to use GPUs - # 2. If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than + # 2. 
If the NGPUS_PER_NODE XML variable in the env_mach_pes.xml file is larger than # the value of MAX_GPUS_PER_NODE, set it to MAX_GPUS_PER_NODE automatically. # 3. If the NGPUS_PER_NODE XML variable is equal to 0, it will be updated to 1 automatically. # ---------------------------------------------------------------------------------------------------------- @@ -401,13 +401,19 @@ def _case_setup_impl( openacc_gpu_offload = case.get_value("OPENACC_GPU_OFFLOAD") openmp_gpu_offload = case.get_value("OPENMP_GPU_OFFLOAD") kokkos_gpu_offload = case.get_value("KOKKOS_GPU_OFFLOAD") - gpu_offload = (openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload) + gpu_offload = ( + openacc_gpu_offload or openmp_gpu_offload or kokkos_gpu_offload + ) ngpus_per_node = case.get_value("NGPUS_PER_NODE") if gpu_type and str(gpu_type).lower() != "none": if max_gpus_per_node <= 0: - raise RuntimeError(f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run") + raise RuntimeError( + f"MAX_GPUS_PER_NODE must be larger than 0 for machine={mach} and compiler={compiler} in order to configure a GPU run" + ) if not gpu_offload: - raise RuntimeError("GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled") + raise RuntimeError( + "GPU_TYPE is defined but none of the GPU OFFLOAD options are enabled" + ) case.gpu_enabled = True if ngpus_per_node >= 0: case.set_value( @@ -417,9 +423,13 @@ def _case_setup_impl( else max_gpus_per_node, ) elif gpu_offload: - raise RuntimeError("GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled") + raise RuntimeError( + "GPU_TYPE is not defined but at least one GPU OFFLOAD option is enabled" + ) elif ngpus_per_node and ngpus_per_node != 0: - raise RuntimeError(f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;") + raise RuntimeError( + f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;" + ) # May need to select new batch settings if pelayout changed (e.g. problem is now too big for prev-selected queue) env_batch = case.get_env("batch") @@ -559,7 +569,6 @@ def case_setup(self, clean=False, test_mode=False, reset=False, keep=None): def _create_case_repo(self, caseroot): - self._gitinterface = GitInterface(caseroot, logger, branch=self.get_value("CASE")) if self._gitinterface and not os.path.exists(os.path.join(caseroot, ".gitignore")): safe_copy(
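Summary sketch (illustrative, not part of the patches above): the job-layout rules that patches 01, 04, 08, and 09 converge on in env_mach_pes.py are spread across several hunks, so a minimal standalone Python sketch of the intended behavior may help. The helper name gpu_job_layout and the example numbers are invented for illustration only; the sketch collapses get_tasks_per_node and get_total_nodes into one function under assumed XML settings and ignores the ESMF_AWARE_THREADING branch and spare-node handling.

    import math

    def gpu_job_layout(total_tasks, max_thread_count, max_tasks_per_node,
                       max_mpitasks_per_node, max_cputasks_per_gpu_node,
                       ngpus_per_node, oversubscribe_gpu):
        # Tasks per node: on a GPU node, either allow more MPI ranks than GPUs
        # (OVERSUBSCRIBE_GPU set, bounded by MAX_CPUTASKS_PER_GPU_NODE) or pin
        # one rank per GPU (bounded by NGPUS_PER_NODE); a CPU-only run is
        # bounded by MAX_MPITASKS_PER_NODE.
        if ngpus_per_node and ngpus_per_node > 0:
            cap = max_cputasks_per_gpu_node if oversubscribe_gpu else ngpus_per_node
        else:
            cap = max_mpitasks_per_node
        tasks_per_node = min(max_tasks_per_node // max_thread_count, cap, total_tasks)

        # Node count: an oversubscribed (or CPU-only) job divides the rank count
        # by tasks per node; a one-rank-per-GPU job divides by GPUs per node.
        if oversubscribe_gpu or not ngpus_per_node:
            num_nodes = math.ceil(total_tasks / tasks_per_node)
        else:
            num_nodes = math.ceil(total_tasks / ngpus_per_node)
        return tasks_per_node, num_nodes

    # 256 single-threaded ranks on nodes with 128 cores, 4 GPUs, and a
    # 64-rank limit per GPU node (all values hypothetical):
    print(gpu_job_layout(256, 1, 128, 128, 64, 4, True))   # (64, 4)  oversubscribed
    print(gpu_job_layout(256, 1, 128, 128, 64, 4, False))  # (4, 64)  one rank per GPU
    print(gpu_job_layout(256, 1, 128, 128, 64, 0, False))  # (128, 2) pure CPU run

In short, with OVERSUBSCRIBE_GPU unset the per-node rank count tracks NGPUS_PER_NODE, so asking for more ranks requests more nodes rather than packing extra CPU ranks alongside the GPUs; with it set, the packing is governed by MAX_CPUTASKS_PER_GPU_NODE as before.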