From 5dcf11a9739c0e41ff53515685198ca8ec75cc11 Mon Sep 17 00:00:00 2001
From: Edgar Gabriel
Date: Fri, 5 Jul 2024 16:44:25 -0500
Subject: [PATCH 1/3] fcoll/vulcan: add support for GPU aggregation buffers

If the user input buffers are GPU device memory, also use GPU device
memory for the aggregation step. This allows the data transfers to occur
between GPU buffers, and hence to take advantage of the much higher
bandwidth of GPU-GPU interconnects (e.g. XGMI, NVLink). The downside of
this approach is that we cannot call directly into the fbtl ipwritev
routine, but have to go through the common_ompio_file_iwrite_pregen
routine, which performs the necessary segmenting and staging through
host memory.

Signed-off-by: Edgar Gabriel
---
 ompi/mca/common/ompio/common_ompio.h          |  2 +
 .../common/ompio/common_ompio_file_write.c    | 69 +++++++++++++++-
 ompi/mca/fcoll/vulcan/fcoll_vulcan.h          |  1 +
 .../vulcan/fcoll_vulcan_file_write_all.c      | 82 ++++++++++++++-----
 ompi/mca/io/ompio/io_ompio.c                  |  5 +-
 ompi/mca/io/ompio/io_ompio.h                  |  4 +-
 ompi/mca/io/ompio/io_ompio_component.c        | 12 ++-
 7 files changed, 149 insertions(+), 26 deletions(-)

diff --git a/ompi/mca/common/ompio/common_ompio.h b/ompi/mca/common/ompio/common_ompio.h
index f6261dff078..1ddb0b47d7f 100644
--- a/ompi/mca/common/ompio/common_ompio.h
+++ b/ompi/mca/common/ompio/common_ompio.h
@@ -262,6 +262,8 @@ OMPI_DECLSPEC int mca_common_ompio_file_write_at (ompio_file_t *fh, OMPI_MPI_OFF
 OMPI_DECLSPEC int mca_common_ompio_file_iwrite (ompio_file_t *fh, const void *buf,
                                                 size_t count, struct ompi_datatype_t *datatype,
                                                 ompi_request_t **request);
 
+OMPI_DECLSPEC int mca_common_ompio_file_iwrite_pregen (ompio_file_t *fh, ompi_request_t *request);
+
 OMPI_DECLSPEC int mca_common_ompio_file_iwrite_at (ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE offset,
                                                    const void *buf, size_t count,
                                                    struct ompi_datatype_t *datatype, ompi_request_t **request);
diff --git a/ompi/mca/common/ompio/common_ompio_file_write.c b/ompi/mca/common/ompio/common_ompio_file_write.c
index b970346f7a2..b5d3900e7a3 100644
--- a/ompi/mca/common/ompio/common_ompio_file_write.c
+++ b/ompi/mca/common/ompio/common_ompio_file_write.c
@@ -12,7 +12,7 @@
  * Copyright (c) 2008-2019 University of Houston. All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024      Triad National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
@@ -329,6 +329,7 @@ static void mca_common_ompio_post_next_write_subreq(struct mca_ompio_request_t *
     decoded_iov.iov_base = req->req_tbuf;
     decoded_iov.iov_len = req->req_size;
     opal_convertor_pack (&req->req_convertor, &decoded_iov, &iov_count, &pos);
+
     mca_common_ompio_build_io_array (req->req_fview, index, req->req_num_subreqs,
                                      bytes_per_cycle, pos, iov_count, &decoded_iov,
@@ -472,6 +473,72 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
     return ret;
 }
 
+/*
+** This routine is invoked from the fcoll component.
+** It is only used if the temporary buffer is a GPU buffer,
+** and the fbtl supports the ipwritev operation.
+**
+** The io-array has already been generated in fcoll/xxx/file_write_all,
+** and we use the pre-computed offsets to create a pseudo fview.
+** The position of the file pointer is updated in the fcoll
+** component, not here.
+*/
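To make the segmenting concrete: the routine below computes the number of
subrequests as ceil(max_data / pipeline_buffer_size), and the last
subrequest carries whatever remainder is left. A small, self-contained
illustration of that arithmetic (the sizes are made up for this example,
not taken from the patch):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        /* 10.5 MiB of aggregated GPU data staged through a 1 MiB host buffer */
        size_t max_data = (size_t)10 * 1024 * 1024 + 512 * 1024;
        size_t pipeline_buf_size = 1024 * 1024;
        int num_subreqs = (int) ceil ((double)max_data / (double)pipeline_buf_size);
        size_t tail = max_data - (size_t)(num_subreqs - 1) * pipeline_buf_size;
        printf ("%d subrequests, last one carries %zu bytes\n", num_subreqs, tail);
        return 0;   /* prints: 11 subrequests, last one carries 524288 bytes */
    }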
+
+int mca_common_ompio_file_iwrite_pregen (ompio_file_t *fh,
+                                         ompi_request_t *request)
+{
+    uint32_t i;
+    size_t max_data;
+    size_t pipeline_buf_size;
+    mca_ompio_request_t *ompio_req = (mca_ompio_request_t *) request;
+
+    if (NULL == fh->f_fbtl->fbtl_ipwritev) {
+        return MPI_ERR_INTERN;
+    }
+
+    max_data = fh->f_io_array[0].length;
+    pipeline_buf_size = OMPIO_MCA_GET(fh, pipeline_buffer_size);
+
+    mca_common_ompio_register_progress ();
+
+    OMPIO_PREPARE_BUF (fh, fh->f_io_array[0].memory_address, max_data, MPI_BYTE,
+                       ompio_req->req_tbuf, &ompio_req->req_convertor, max_data,
+                       pipeline_buf_size, NULL, i);
+
+    ompio_req->req_num_subreqs = ceil((double)max_data/pipeline_buf_size);
+    ompio_req->req_size = pipeline_buf_size;
+    ompio_req->req_max_data = max_data;
+    ompio_req->req_post_next_subreq = mca_common_ompio_post_next_write_subreq;
+    ompio_req->req_fh = fh;
+    ompio_req->req_ompi.req_status.MPI_ERROR = MPI_SUCCESS;
+
+    ompio_req->req_fview = (struct ompio_fview_t *) calloc(1, sizeof(struct ompio_fview_t));
+    if (NULL == ompio_req->req_fview) {
+        opal_output(1, "common_ompio: error allocating memory\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ompio_req->req_fview->f_decoded_iov = (struct iovec*) malloc (fh->f_num_of_io_entries *
+                                                                  sizeof(struct iovec));
+    if (NULL == ompio_req->req_fview->f_decoded_iov) {
+        opal_output(1, "common_ompio_file_iwrite_pregen: could not allocate memory\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ompio_req->req_fview->f_iov_count = fh->f_num_of_io_entries;
+    for (i=0; i < ompio_req->req_fview->f_iov_count; i++) {
+        ompio_req->req_fview->f_decoded_iov[i].iov_base = fh->f_io_array[i].offset;
+        ompio_req->req_fview->f_decoded_iov[i].iov_len = fh->f_io_array[i].length;
+    }
+
+    fh->f_num_of_io_entries = 0;
+    free (fh->f_io_array);
+    fh->f_io_array = NULL;
+
+    mca_common_ompio_post_next_write_subreq(ompio_req, 0);
+    return OMPI_SUCCESS;
+}
+
 int mca_common_ompio_file_iwrite_at (ompio_file_t *fh,
                                      OMPI_MPI_OFFSET_TYPE offset,
                                      const void *buf,
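One detail of the routine above is worth spelling out: the pseudo fview
stores the pre-computed *file offset* of each io-array entry in iov_base,
a field that normally holds a memory address. A self-contained sketch of
that translation (struct io_entry is a simplified stand-in for the real
io-array entry type, not the actual OMPIO declaration):

    #include <stdlib.h>
    #include <sys/uio.h>

    struct io_entry {           /* stand-in for an OMPIO io-array entry */
        void  *memory_address;  /* source buffer for this piece         */
        void  *offset;          /* pre-computed file offset             */
        size_t length;          /* bytes to transfer at that offset     */
    };

    static struct iovec *make_pseudo_fview (const struct io_entry *io, int n)
    {
        struct iovec *iov = (struct iovec *) malloc (n * sizeof(struct iovec));
        if (NULL == iov) return NULL;
        for (int i = 0; i < n; i++) {
            iov[i].iov_base = io[i].offset;  /* file offset smuggled into iov_base */
            iov[i].iov_len  = io[i].length;
        }
        return iov;
    }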
diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan.h b/ompi/mca/fcoll/vulcan/fcoll_vulcan.h
index ebddf429a5c..a2fd6ca82bc 100644
--- a/ompi/mca/fcoll/vulcan/fcoll_vulcan.h
+++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan.h
@@ -43,6 +43,7 @@ extern int mca_fcoll_vulcan_priority;
 extern int mca_fcoll_vulcan_num_groups;
 extern int mca_fcoll_vulcan_write_chunksize;
 extern int mca_fcoll_vulcan_async_io;
+extern int mca_fcoll_vulcan_use_accelerator_buffers;
 
 OMPI_DECLSPEC extern mca_fcoll_base_component_3_0_0_t mca_fcoll_vulcan_component;
 
diff --git a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c
index 8318b0ef374..b928f795bfe 100644
--- a/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c
+++ b/ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c
@@ -15,6 +15,7 @@
  * Copyright (c) 2023      Jeffrey M. Squyres. All rights reserved.
  * Copyright (c) 2024      Triad National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2024      Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -30,10 +31,12 @@
 #include "ompi/mca/fcoll/fcoll.h"
 #include "ompi/mca/fcoll/base/fcoll_base_coll_array.h"
 #include "ompi/mca/common/ompio/common_ompio.h"
+#include "ompi/mca/common/ompio/common_ompio_buffer.h"
 #include "ompi/mca/io/io.h"
 #include "ompi/mca/common/ompio/common_ompio_request.h"
 #include "math.h"
 #include "ompi/mca/pml/pml.h"
+#include "opal/mca/accelerator/accelerator.h"
 #include <unistd.h>
 
 #define DEBUG_ON 0
@@ -88,13 +91,12 @@ typedef struct mca_io_ompio_aggregator_data {
             _aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t;    } \
 }
 
-
-
 static int shuffle_init ( int index, int cycles, int aggregator, int rank,
                           mca_io_ompio_aggregator_data *data, ompi_request_t **reqs );
 
 static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data,
-                       int write_chunksize, int write_synchType, ompi_request_t **request);
+                       int write_chunksize, int write_synchType, ompi_request_t **request,
+                       bool is_accelerator_buffer);
 
 int mca_fcoll_vulcan_break_file_view ( struct iovec *decoded_iov, int iov_count,
                                        struct iovec *local_iov_array, int local_count,
                                        struct iovec ***broken_decoded_iovs, int **broken_iov_counts,
@@ -155,6 +157,8 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
     ompi_count_array_t fview_count_desc;
     ompi_disp_array_t displs_desc;
+    int is_gpu, is_managed;
+    bool use_accelerator_buffer = false;
 
 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
     double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
@@ -180,6 +184,11 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
         goto exit;
     }
 
+    mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed);
+    if (is_gpu && !is_managed &&
+        fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) {
+        use_accelerator_buffer = true;
+    }
     /* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what
        the user requested */
     bytes_per_cycle =bytes_per_cycle/2;
@@ -529,13 +538,31 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
             goto exit;
         }
 
-
-        aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle);
-        aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle);
-        if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){
-            opal_output(1, "OUT OF MEMORY");
-            ret = OMPI_ERR_OUT_OF_RESOURCE;
-            goto exit;
+        if (use_accelerator_buffer) {
+            opal_output_verbose(10, ompi_fcoll_base_framework.framework_output,
+                                "Allocating GPU device buffer for aggregation\n");
+            ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf,
+                                             bytes_per_cycle);
+            if (OPAL_SUCCESS != ret) {
+                opal_output(1, "Could not allocate accelerator memory");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
+            ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->prev_global_buf,
+                                             bytes_per_cycle);
+            if (OPAL_SUCCESS != ret) {
+                opal_output(1, "Could not allocate accelerator memory");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
+        } else {
+            aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle);
+            aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle);
+            if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){
+                opal_output(1, "OUT OF MEMORY");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
         }
 
         aggr_data[i]->recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group *
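The pair of buffers allocated above implements the ping-pong scheme that
motivated halving bytes_per_cycle earlier in this function: while one
cycle's aggregated data is being written out, the next cycle is gathered
into the other buffer. A toy model of the rotation, with synchronous
stand-ins for the asynchronous shuffle and write (the helpers are
hypothetical, not the vulcan code):

    #include <stdio.h>
    #include <string.h>

    #define CYCLES 4
    #define BYTES_PER_CYCLE 8

    static void aggregate (char *buf, int cycle) { memset (buf, 'a' + cycle, BYTES_PER_CYCLE); }
    static void write_out (const char *buf)      { fwrite (buf, 1, BYTES_PER_CYCLE, stdout); }

    int main(void) {
        char buf_a[BYTES_PER_CYCLE], buf_b[BYTES_PER_CYCLE];
        char *global_buf = buf_a, *prev_global_buf = buf_b;
        for (int c = 0; c < CYCLES; c++) {
            aggregate (global_buf, c);     /* gather this cycle's data           */
            write_out (global_buf);        /* in vulcan this write is in flight, */
            char *tmp = global_buf;        /* so the buffers are swapped before  */
            global_buf = prev_global_buf;  /* the next aggregation touches       */
            prev_global_buf = tmp;         /* memory that is still being written */
        }
        putchar ('\n');
        return 0;   /* prints: aaaaaaaabbbbbbbbccccccccdddddddd */
    }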
@@ -605,7 +632,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
             start_write_time = MPI_Wtime();
 #endif
             ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index],
-                              write_chunksize, write_synch_type, &req_iwrite);
+                              write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer);
             if (OMPI_SUCCESS != ret){
                 goto exit;
             }
@@ -645,7 +672,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
             start_write_time = MPI_Wtime();
 #endif
             ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index],
-                              write_chunksize, write_synch_type, &req_iwrite);
+                              write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer);
             if (OMPI_SUCCESS != ret){
                 goto exit;
             }
@@ -704,8 +731,13 @@ exit :
             free (aggr_data[i]->disp_index);
             free (aggr_data[i]->max_disp_index);
-            free (aggr_data[i]->global_buf);
-            free (aggr_data[i]->prev_global_buf);
+            if (use_accelerator_buffer) {
+                opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf);
+                opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf);
+            } else {
+                free (aggr_data[i]->global_buf);
+                free (aggr_data[i]->prev_global_buf);
+            }
             for(l=0;l<aggr_data[i]->procs_per_group;l++){
                 free (aggr_data[i]->blocklen_per_process[l]);
                 free (aggr_data[i]->displs_per_process[l]);
@@ -749,7 +781,8 @@ static int write_init (ompio_file_t *fh,
                        mca_io_ompio_aggregator_data *aggr_data,
                        int write_chunksize,
                        int write_synchType,
-                       ompi_request_t **request )
+                       ompi_request_t **request,
+                       bool is_accelerator_buffer)
 {
     int ret = OMPI_SUCCESS;
     ssize_t ret_temp = 0;
@@ -770,11 +803,20 @@ static int write_init (ompio_file_t *fh,
                                     write_chunksize);
 
     if (1 == write_synchType) {
-        ret = fh->f_fbtl->fbtl_ipwritev(fh, (ompi_request_t *) ompio_req);
-        if(0 > ret) {
-            opal_output (1, "vulcan_write_all: fbtl_ipwritev failed\n");
-            ompio_req->req_ompi.req_status.MPI_ERROR = ret;
-            ompio_req->req_ompi.req_status._ucount = 0;
+        if (is_accelerator_buffer) {
+            ret = mca_common_ompio_file_iwrite_pregen(fh, (ompi_request_t *) ompio_req);
+            if(0 > ret) {
+                opal_output (1, "vulcan_write_all: mca_common_ompio_file_iwrite_pregen failed\n");
+                ompio_req->req_ompi.req_status.MPI_ERROR = ret;
+                ompio_req->req_ompi.req_status._ucount = 0;
+            }
+        } else {
+            ret = fh->f_fbtl->fbtl_ipwritev(fh, (ompi_request_t *) ompio_req);
+            if(0 > ret) {
+                opal_output (1, "vulcan_write_all: fbtl_ipwritev failed\n");
+                ompio_req->req_ompi.req_status.MPI_ERROR = ret;
+                ompio_req->req_ompi.req_status._ucount = 0;
+            }
         }
     } else {
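The vulcan code above asks the io component for the parameter by name;
the io_ompio.c hunk that follows adds the matching branch on the io side.
The dispatch idiom, reduced to a self-contained sketch (the return values
are illustrative defaults, not authoritative):

    #include <string.h>

    static int get_mca_parameter_value (const char *name, int name_length)
    {
        if (!strncmp (name, "coll_timing_info", name_length)) {
            return 0;                      /* current value of that switch   */
        }
        if (!strncmp (name, "use_accelerator_buffers", name_length)) {
            return 0;                      /* off by default as of PATCH 3/3 */
        }
        return -1;                         /* unknown parameter name         */
    }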
diff --git a/ompi/mca/io/ompio/io_ompio.c b/ompi/mca/io/ompio/io_ompio.c
index 2cc27e23ddc..506b6897e46 100644
--- a/ompi/mca/io/ompio/io_ompio.c
+++ b/ompi/mca/io/ompio/io_ompio.c
@@ -15,7 +15,7 @@
  * Copyright (c) 2012-2013 Inria. All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2022      Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -556,6 +556,9 @@ int mca_io_ompio_get_mca_parameter_value ( char *mca_parameter_name, int name_le
     else if ( !strncmp ( mca_parameter_name, "coll_timing_info", name_length )) {
         return mca_io_ompio_coll_timing_info;
     }
+    else if ( !strncmp (mca_parameter_name, "use_accelerator_buffers", name_length)) {
+        return mca_io_ompio_use_accelerator_buffers;
+    }
     else {
         opal_output (1, "Error in mca_io_ompio_get_mca_parameter_value: unknown parameter name");
     }
diff --git a/ompi/mca/io/ompio/io_ompio.h b/ompi/mca/io/ompio/io_ompio.h
index 272c4e4a0c6..d38bfb1acb3 100644
--- a/ompi/mca/io/ompio/io_ompio.h
+++ b/ompi/mca/io/ompio/io_ompio.h
@@ -14,7 +14,7 @@
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
- * Copyright (c) 2022      Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024      Triad National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
@@ -57,7 +57,7 @@ extern int mca_io_ompio_max_aggregators_ratio;
 extern int mca_io_ompio_aggregators_cutoff_threshold;
 extern int mca_io_ompio_overwrite_amode;
 extern int mca_io_ompio_verbose_info_parsing;
-
+extern int mca_io_ompio_use_accelerator_buffers;
 OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info;
 
 #define QUEUESIZE 2048
diff --git a/ompi/mca/io/ompio/io_ompio_component.c b/ompi/mca/io/ompio/io_ompio_component.c
index a8151315f40..38c4e8c3105 100644
--- a/ompi/mca/io/ompio/io_ompio_component.c
+++ b/ompi/mca/io/ompio/io_ompio_component.c
@@ -17,7 +17,7 @@
  *                         and Technology (RIST). All rights reserved.
  * Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
  * Copyright (c) 2018      DataDirect Networks. All rights reserved.
- * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024      Triad National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
@@ -49,7 +49,7 @@ int mca_io_ompio_max_aggregators_ratio=8;
 int mca_io_ompio_aggregators_cutoff_threshold=3;
 int mca_io_ompio_overwrite_amode = 1;
 int mca_io_ompio_verbose_info_parsing = 0;
-
+int mca_io_ompio_use_accelerator_buffers = 1;
 int mca_io_ompio_grouping_option=5;
 
 /*
@@ -263,6 +263,14 @@ static int register_component(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_io_ompio_verbose_info_parsing);
 
+    mca_io_ompio_use_accelerator_buffers = 1;
+    (void) mca_base_component_var_register(&mca_io_ompio_component.io_version,
+                                           "use_accelerator_buffers", "Allow using accelerator buffers "
+                                           "for data aggregation in collective I/O if input buffer is device memory",
+                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                           OPAL_INFO_LVL_9,
+                                           MCA_BASE_VAR_SCOPE_READONLY, &mca_io_ompio_use_accelerator_buffers);
+
     return OMPI_SUCCESS;
 }
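Taken together, the conditions this series gates the feature on can be
condensed into one predicate; a self-contained sketch (note that the
write path above defers the fbtl capability check to iwrite_pregen
itself, while the read path in the next patch checks it up front):

    #include <stdbool.h>

    /* The user buffer must be unmanaged device memory, the fbtl must offer
     * a nonblocking writev/readv, and the MCA switch must be enabled. */
    static bool want_accelerator_aggregation (bool is_gpu, bool is_managed,
                                              bool fbtl_has_nonblocking,
                                              int  use_accelerator_buffers)
    {
        return is_gpu && !is_managed && fbtl_has_nonblocking &&
               (0 != use_accelerator_buffers);
    }

At runtime the switch should be reachable as the
io_ompio_use_accelerator_buffers MCA parameter (e.g. mpirun --mca
io_ompio_use_accelerator_buffers 1), assuming the usual
framework_component_variable naming of MCA parameters.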
From 18bb9d718782a53b8f7ed2a9a2d751d73cfbe87e Mon Sep 17 00:00:00 2001
From: Edgar Gabriel
Date: Mon, 22 Jul 2024 07:55:07 -0700
Subject: [PATCH 2/3] common/ompio: add support for gpu aggr. buffers for read_all

Add support for using accelerator buffers in the aggregation step of the
read_all operation. This lives in common/ompio instead of an fcoll
component, since at the moment all fcoll components (except individual)
use the default implementation, which was moved to common/ompio a while
back to avoid code duplication.

Signed-off-by: Edgar Gabriel
---
 ompi/mca/common/ompio/common_ompio.h          |  2 +
 .../mca/common/ompio/common_ompio_file_read.c | 64 ++++++++++++++++++-
 .../common/ompio/common_ompio_file_read_all.c | 64 ++++++++++++++++---
 3 files changed, 119 insertions(+), 11 deletions(-)

diff --git a/ompi/mca/common/ompio/common_ompio.h b/ompi/mca/common/ompio/common_ompio.h
index 1ddb0b47d7f..8b12f7b8cc5 100644
--- a/ompi/mca/common/ompio/common_ompio.h
+++ b/ompi/mca/common/ompio/common_ompio.h
@@ -299,6 +299,8 @@ OMPI_DECLSPEC int mca_common_ompio_file_read_at (ompio_file_t *fh, OMPI_MPI_OFFS
 OMPI_DECLSPEC int mca_common_ompio_file_iread (ompio_file_t *fh, void *buf,
                                                size_t count, struct ompi_datatype_t *datatype,
                                                ompi_request_t **request);
 
+OMPI_DECLSPEC int mca_common_ompio_file_iread_pregen (ompio_file_t *fh, ompi_request_t *request);
+
 OMPI_DECLSPEC int mca_common_ompio_file_iread_at (ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE offset,
                                                   void *buf, size_t count,
                                                   struct ompi_datatype_t *datatype, ompi_request_t **request);
diff --git a/ompi/mca/common/ompio/common_ompio_file_read.c b/ompi/mca/common/ompio/common_ompio_file_read.c
index e49cfb42b59..15a62f455c1 100644
--- a/ompi/mca/common/ompio/common_ompio_file_read.c
+++ b/ompi/mca/common/ompio/common_ompio_file_read.c
@@ -12,7 +12,7 @@
  * Copyright (c) 2008-2019 University of Houston. All rights reserved.
  * Copyright (c) 2018      Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
- * Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (c) 2024      Triad National Security, LLC. All rights
  *                         reserved.
  * $COPYRIGHT$
@@ -530,6 +530,68 @@ int mca_common_ompio_file_iread (ompio_file_t *fh,
     return ret;
 }
 
+/*
+** This routine is invoked from file_read_all.
+** It is only used if the temporary buffer is a GPU buffer,
+** and the fbtl supports the ipreadv operation.
+**
+** The io-array has already been generated in file_read_all,
+** and we use the pre-computed offsets to create a pseudo fview.
+** The position of the file pointer is updated in the file_read_all
+** operation, not here.
+*/
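The staging that the routine below drives can be pictured end to end:
each subrequest reads one pipeline buffer's worth of file data into host
memory, and the convertor then unpacks it into the device buffer. A toy
model with plain memcpy standing in for both the fbtl read and the
convertor unpack:

    #include <stdio.h>
    #include <string.h>

    enum { PIPELINE_BUF = 4, TOTAL = 10 };

    int main(void) {
        const char file_data[TOTAL + 1] = "0123456789"; /* stands in for the file   */
        char host_bounce[PIPELINE_BUF];                 /* host staging buffer      */
        char device_buf[TOTAL + 1] = {0};               /* stands in for GPU memory */
        for (size_t done = 0; done < TOTAL; ) {
            size_t len = (TOTAL - done < PIPELINE_BUF) ? TOTAL - done : PIPELINE_BUF;
            memcpy (host_bounce, file_data + done, len);  /* fbtl ipreadv subrequest */
            memcpy (device_buf + done, host_bounce, len); /* convertor unpack to GPU */
            done += len;
        }
        printf ("%s\n", device_buf); /* prints: 0123456789 */
        return 0;
    }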
+
+int mca_common_ompio_file_iread_pregen (ompio_file_t *fh,
+                                        ompi_request_t *request)
+{
+    uint32_t i;
+    size_t max_data;
+    size_t pipeline_buf_size;
+    mca_ompio_request_t *ompio_req = (mca_ompio_request_t *) request;
+
+    max_data = fh->f_io_array[0].length;
+    pipeline_buf_size = OMPIO_MCA_GET(fh, pipeline_buffer_size);
+
+    mca_common_ompio_register_progress ();
+
+    OMPIO_PREPARE_READ_BUF (fh, fh->f_io_array[0].memory_address, max_data, MPI_BYTE,
+                            ompio_req->req_tbuf, &ompio_req->req_convertor, max_data,
+                            pipeline_buf_size, NULL, i);
+
+    ompio_req->req_num_subreqs = ceil((double)max_data/pipeline_buf_size);
+    ompio_req->req_size = pipeline_buf_size;
+    ompio_req->req_max_data = max_data;
+    ompio_req->req_post_next_subreq = mca_common_ompio_post_next_read_subreq;
+    ompio_req->req_fh = fh;
+    ompio_req->req_ompi.req_status.MPI_ERROR = MPI_SUCCESS;
+
+    ompio_req->req_fview = (struct ompio_fview_t *) calloc(1, sizeof(struct ompio_fview_t));
+    if (NULL == ompio_req->req_fview) {
+        opal_output(1, "common_ompio: error allocating memory\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ompio_req->req_fview->f_decoded_iov = (struct iovec*) malloc (fh->f_num_of_io_entries *
+                                                                  sizeof(struct iovec));
+    if (NULL == ompio_req->req_fview->f_decoded_iov) {
+        opal_output(1, "common_ompio_file_iread_pregen: could not allocate memory\n");
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    ompio_req->req_fview->f_iov_count = fh->f_num_of_io_entries;
+    for (i=0; i < ompio_req->req_fview->f_iov_count; i++) {
+        ompio_req->req_fview->f_decoded_iov[i].iov_base = fh->f_io_array[i].offset;
+        ompio_req->req_fview->f_decoded_iov[i].iov_len = fh->f_io_array[i].length;
+    }
+
+    fh->f_num_of_io_entries = 0;
+    free (fh->f_io_array);
+    fh->f_io_array = NULL;
+
+    mca_common_ompio_post_next_read_subreq(ompio_req, 0);
+    return OMPI_SUCCESS;
+}
+
 int mca_common_ompio_file_iread_at (ompio_file_t *fh,
                                     OMPI_MPI_OFFSET_TYPE offset,
                                     void *buf,
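The read_all changes that follow pair every accelerator allocation with a
matching release on the exit path, because device memory must not be
returned through free(). The shape of that pairing, as a self-contained
skeleton in which gpu_alloc/gpu_free are stand-ins for
opal_accelerator.mem_alloc/mem_release:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>

    static int  gpu_alloc (void **buf, size_t size) { *buf = malloc (size); return NULL == *buf; }
    static void gpu_free  (void *buf)               { free (buf); }

    int main(void) {
        bool use_accelerator_buffer = true;   /* decided by the checks shown earlier */
        size_t bytes_per_cycle = 1024;
        char *global_buf = NULL;
        if (use_accelerator_buffer) {
            if (gpu_alloc ((void**)&global_buf, bytes_per_cycle)) return 1;
        } else {
            global_buf = (char *) malloc (bytes_per_cycle);
            if (NULL == global_buf) return 1;
        }
        /* ... aggregation cycles would run here ... */
        if (use_accelerator_buffer) {
            gpu_free (global_buf);            /* release through the same framework */
        } else {
            free (global_buf);
        }
        return 0;
    }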
diff --git a/ompi/mca/common/ompio/common_ompio_file_read_all.c b/ompi/mca/common/ompio/common_ompio_file_read_all.c
index 6cfd5127033..1b2f8d6c474 100644
--- a/ompi/mca/common/ompio/common_ompio_file_read_all.c
+++ b/ompi/mca/common/ompio/common_ompio_file_read_all.c
@@ -15,6 +15,7 @@
  * Copyright (c) 2023      Jeffrey M. Squyres. All rights reserved.
  * Copyright (c) 2024      Triad National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2024      Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -30,9 +31,12 @@
 #include "ompi/mca/fcoll/base/fcoll_base_coll_array.h"
 #include "ompi/mca/fcoll/base/base.h"
 #include "ompi/mca/common/ompio/common_ompio.h"
+#include "ompi/mca/common/ompio/common_ompio_request.h"
+#include "ompi/mca/common/ompio/common_ompio_buffer.h"
 #include "ompi/mca/io/io.h"
 #include "math.h"
 #include "ompi/mca/pml/pml.h"
+#include "opal/mca/accelerator/accelerator.h"
 #include <unistd.h>
 
 #define DEBUG_ON 0
@@ -106,6 +110,9 @@ mca_common_ompio_base_file_read_all (struct ompio_file_t *fh,
     int* blocklength_proc = NULL;
     ptrdiff_t* displs_proc = NULL;
 
+    int is_gpu, is_managed;
+    bool use_accelerator_buffer = false;
+
 #if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
     double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
     double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
@@ -138,6 +145,12 @@ mca_common_ompio_base_file_read_all (struct ompio_file_t *fh,
         goto exit;
     }
 
+    mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed);
+    if (is_gpu && !is_managed && NULL != fh->f_fbtl->fbtl_ipreadv &&
+        fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) {
+        use_accelerator_buffer = true;
+    }
+
     ret = mca_common_ompio_set_aggregator_props ((struct ompio_file_t *) fh,
                                                  base_num_io_procs,
                                                  max_data);
@@ -364,11 +377,22 @@ mca_common_ompio_base_file_read_all (struct ompio_file_t *fh,
             goto exit;
         }
 
-        global_buf = (char *) malloc (bytes_per_cycle);
-        if (NULL == global_buf){
-            opal_output(1, "OUT OF MEMORY\n");
-            ret = OMPI_ERR_OUT_OF_RESOURCE;
-            goto exit;
+        if (use_accelerator_buffer) {
+            opal_output_verbose(10, ompi_fcoll_base_framework.framework_output,
+                                "Allocating GPU device buffer for aggregation\n");
+            ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&global_buf,
+                                             bytes_per_cycle);
+            if (OPAL_SUCCESS != ret) {
+                opal_output(1, "Could not allocate accelerator memory");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
+        } else {
+            global_buf = (char *) malloc (bytes_per_cycle);
+            if (NULL == global_buf){
+                opal_output(1, "OUT OF MEMORY\n");
+                ret = OMPI_ERR_OUT_OF_RESOURCE;
+                goto exit;
+            }
         }
 
         sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
@@ -686,10 +710,26 @@ mca_common_ompio_base_file_read_all (struct ompio_file_t *fh,
 #endif
 
         if (fh->f_num_of_io_entries) {
-            if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
-                opal_output (1, "READ FAILED\n");
-                ret = OMPI_ERROR;
-                goto exit;
+            if (use_accelerator_buffer) {
+                mca_ompio_request_t *ompio_req = NULL;
+                mca_common_ompio_request_alloc (&ompio_req, MCA_OMPIO_REQUEST_READ);
+
+                ret = mca_common_ompio_file_iread_pregen(fh, (ompi_request_t *) ompio_req);
+                if(0 > ret) {
+                    opal_output (1, "common_ompio_file_read_all: mca_common_ompio_file_iread_pregen failed\n");
+                    ompio_req->req_ompi.req_status.MPI_ERROR = ret;
+                    ompio_req->req_ompi.req_status._ucount = 0;
+                }
+                ret = ompi_request_wait ((ompi_request_t**)&ompio_req, MPI_STATUS_IGNORE);
+                if (OMPI_SUCCESS != ret){
+                    goto exit;
+                }
+            } else {
+                if ( 0 > fh->f_fbtl->fbtl_preadv (fh)) {
+                    opal_output (1, "READ FAILED\n");
+                    ret = OMPI_ERROR;
+                    goto exit;
+                }
             }
         }
 
@@ -881,7 +921,11 @@ mca_common_ompio_base_file_read_all (struct ompio_file_t *fh,
 exit:
     if (NULL != global_buf) {
-        free (global_buf);
+        if (use_accelerator_buffer) {
+            opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, global_buf);
+        } else {
+            free (global_buf);
+        }
         global_buf = NULL;
     }
     if (NULL != sorted) {
From d30471c1f62d47c47a467d72b011c9e4d450596a Mon Sep 17 00:00:00 2001
From: Edgar Gabriel
Date: Mon, 22 Jul 2024 07:56:40 -0700
Subject: [PATCH 3/3] io/ompio: turn accel. aggr. buffer off by default

Performance measurements indicate that in most cases using a CPU host
buffer for data aggregation leads to better performance than using a GPU
buffer, so turn the feature off by default.

Signed-off-by: Edgar Gabriel
---
 ompi/mca/io/ompio/io_ompio_component.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ompi/mca/io/ompio/io_ompio_component.c b/ompi/mca/io/ompio/io_ompio_component.c
index 38c4e8c3105..0e6fcd01f6c 100644
--- a/ompi/mca/io/ompio/io_ompio_component.c
+++ b/ompi/mca/io/ompio/io_ompio_component.c
@@ -49,7 +49,7 @@ int mca_io_ompio_max_aggregators_ratio=8;
 int mca_io_ompio_aggregators_cutoff_threshold=3;
 int mca_io_ompio_overwrite_amode = 1;
 int mca_io_ompio_verbose_info_parsing = 0;
-int mca_io_ompio_use_accelerator_buffers = 1;
+int mca_io_ompio_use_accelerator_buffers = 0;
 int mca_io_ompio_grouping_option=5;
 
 /*
@@ -263,7 +263,7 @@ static int register_component(void)
                                            MCA_BASE_VAR_SCOPE_READONLY,
                                            &mca_io_ompio_verbose_info_parsing);
 
-    mca_io_ompio_use_accelerator_buffers = 1;
+    mca_io_ompio_use_accelerator_buffers = 0;
     (void) mca_base_component_var_register(&mca_io_ompio_component.io_version,
                                            "use_accelerator_buffers", "Allow using accelerator buffers "
                                            "for data aggregation in collective I/O if input buffer is device memory",