fcoll/vulcan: add support for GPU aggregation buffers
If the user input buffers are GPU device memory, also use GPU device memory for the aggregation step. This allows the data transfer to occur between GPU buffers, and hence to take advantage of the much higher bandwidth of GPU-GPU interconnects (e.g. XGMI, NVLink).

The downside of this approach is that we cannot call directly into the fbtl ipwritev routine, but have to go through the common_ompio_file_iwrite_pregen routine, which performs the necessary segmenting and staging through host memory.
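As a rough illustration, the decision logic introduced by this commit boils down to the following sketch. It is condensed from the diff below and is not runnable on its own: the allocation loop, pipelining, error handling, and request bookkeeping are omitted, and variables such as fh, buf, aggr_data, and ompio_req come from the surrounding vulcan code.

    /* Sketch only: condensed from fcoll_vulcan_file_write_all.c in this commit. */
    mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed);
    if (is_gpu && !is_managed &&
        fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) {
        use_accelerator_buffer = true;
    }

    /* The aggregation buffer follows the locality of the user buffer. */
    if (use_accelerator_buffer) {
        ret = opal_accelerator.mem_alloc (MCA_ACCELERATOR_NO_DEVICE_ID,
                                          (void **)&aggr_data[i]->global_buf, bytes_per_cycle);
    } else {
        aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle);
    }

    /* A device-memory aggregation buffer cannot be handed to fbtl_ipwritev directly;
     * common_ompio_file_iwrite_pregen segments the pre-built io-array and stages each
     * chunk through host memory instead. */
    if (use_accelerator_buffer) {
        ret = mca_common_ompio_file_iwrite_pregen (fh, (ompi_request_t *) ompio_req);
    } else {
        ret = fh->f_fbtl->fbtl_ipwritev (fh, (ompi_request_t *) ompio_req);
    }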

Signed-off-by: Edgar Gabriel <edgar.gabriel@amd.com>
edgargabriel committed Jul 18, 2024
1 parent a6d5b36 commit 8b24867
Showing 7 changed files with 149 additions and 26 deletions.
2 changes: 2 additions & 0 deletions ompi/mca/common/ompio/common_ompio.h
@@ -262,6 +262,8 @@ OMPI_DECLSPEC int mca_common_ompio_file_write_at (ompio_file_t *fh, OMPI_MPI_OFF
OMPI_DECLSPEC int mca_common_ompio_file_iwrite (ompio_file_t *fh, const void *buf, size_t count,
struct ompi_datatype_t *datatype, ompi_request_t **request);

OMPI_DECLSPEC int mca_common_ompio_file_iwrite_pregen (ompio_file_t *fh, ompi_request_t *request);

OMPI_DECLSPEC int mca_common_ompio_file_iwrite_at (ompio_file_t *fh, OMPI_MPI_OFFSET_TYPE offset,
const void *buf, size_t count, struct ompi_datatype_t *datatype,
ompi_request_t **request);
69 changes: 68 additions & 1 deletion ompi/mca/common/ompio/common_ompio_file_write.c
@@ -12,7 +12,7 @@
* Copyright (c) 2008-2019 University of Houston. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2024 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@@ -329,6 +329,7 @@ static void mca_common_ompio_post_next_write_subreq(struct mca_ompio_request_t *
decoded_iov.iov_base = req->req_tbuf;
decoded_iov.iov_len = req->req_size;
opal_convertor_pack (&req->req_convertor, &decoded_iov, &iov_count, &pos);

mca_common_ompio_build_io_array (req->req_fview, index, req->req_num_subreqs,
bytes_per_cycle, pos,
iov_count, &decoded_iov,
@@ -472,6 +473,72 @@ int mca_common_ompio_file_iwrite (ompio_file_t *fh,
return ret;
}

/*
** This routine is invoked from the fcoll component.
** It is only used if the temporary buffer is a gpu buffer,
** and the fbtl supports the ipwritev operation.
**
** The io-array has already been generated in fcoll/xxx/file_write_all,
** and we use the pre-computed offsets to create a pseudo fview.
** The position of the file pointer is updated in the fcoll
** component, not here.
*/

int mca_common_ompio_file_iwrite_pregen (ompio_file_t *fh,
ompi_request_t *request)
{
uint32_t i;
size_t max_data;
size_t pipeline_buf_size;
mca_ompio_request_t *ompio_req = (mca_ompio_request_t *) request;

if (NULL == fh->f_fbtl->fbtl_ipwritev) {
return MPI_ERR_INTERN;
}

max_data = fh->f_io_array[0].length;
pipeline_buf_size = OMPIO_MCA_GET(fh, pipeline_buffer_size);

mca_common_ompio_register_progress ();

OMPIO_PREPARE_BUF (fh, fh->f_io_array[0].memory_address, max_data, MPI_BYTE,
ompio_req->req_tbuf, &ompio_req->req_convertor, max_data,
pipeline_buf_size, NULL, i);

ompio_req->req_num_subreqs = ceil((double)max_data/pipeline_buf_size);
ompio_req->req_size = pipeline_buf_size;
ompio_req->req_max_data = max_data;
ompio_req->req_post_next_subreq = mca_common_ompio_post_next_write_subreq;
ompio_req->req_fh = fh;
ompio_req->req_ompi.req_status.MPI_ERROR = MPI_SUCCESS;

ompio_req->req_fview = (struct ompio_fview_t *) calloc(1, sizeof(struct ompio_fview_t));
if (NULL == ompio_req->req_fview) {
opal_output(1, "common_ompio: error allocating memory\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}

ompio_req->req_fview->f_decoded_iov = (struct iovec*) malloc ( fh->f_num_of_io_entries *
sizeof(struct iovec));
if (NULL == ompio_req->req_fview->f_decoded_iov) {
opal_output(1, "common_ompio_file_iwrite_pregen: could not allocate memory\n");
return OMPI_ERR_OUT_OF_RESOURCE;
}

ompio_req->req_fview->f_iov_count = fh->f_num_of_io_entries;
for (i=0; i < ompio_req->req_fview->f_iov_count; i++) {
ompio_req->req_fview->f_decoded_iov[i].iov_base = fh->f_io_array[i].offset;
ompio_req->req_fview->f_decoded_iov[i].iov_len = fh->f_io_array[i].length ;
}

fh->f_num_of_io_entries = 0;
free (fh->f_io_array);
fh->f_io_array = NULL;

mca_common_ompio_post_next_write_subreq(ompio_req, 0);
return OMPI_SUCCESS;
}

int mca_common_ompio_file_iwrite_at (ompio_file_t *fh,
OMPI_MPI_OFFSET_TYPE offset,
const void *buf,
1 change: 1 addition & 0 deletions ompi/mca/fcoll/vulcan/fcoll_vulcan.h
@@ -43,6 +43,7 @@ extern int mca_fcoll_vulcan_priority;
extern int mca_fcoll_vulcan_num_groups;
extern int mca_fcoll_vulcan_write_chunksize;
extern int mca_fcoll_vulcan_async_io;
extern int mca_fcoll_vulcan_use_accelerator_buffers;

OMPI_DECLSPEC extern mca_fcoll_base_component_3_0_0_t mca_fcoll_vulcan_component;

82 changes: 62 additions & 20 deletions ompi/mca/fcoll/vulcan/fcoll_vulcan_file_write_all.c
@@ -15,6 +15,7 @@
* Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
* Copyright (c) 2024 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -30,10 +31,12 @@
#include "ompi/mca/fcoll/fcoll.h"
#include "ompi/mca/fcoll/base/fcoll_base_coll_array.h"
#include "ompi/mca/common/ompio/common_ompio.h"
#include "ompi/mca/common/ompio/common_ompio_buffer.h"
#include "ompi/mca/io/io.h"
#include "ompi/mca/common/ompio/common_ompio_request.h"
#include "math.h"
#include "ompi/mca/pml/pml.h"
#include "opal/mca/accelerator/accelerator.h"
#include <unistd.h>

#define DEBUG_ON 0
@@ -88,13 +91,12 @@ typedef struct mca_io_ompio_aggregator_data {
_aggr[_i]->prev_recvtype=(ompi_datatype_t **)_t; } \
}



static int shuffle_init ( int index, int cycles, int aggregator, int rank,
mca_io_ompio_aggregator_data *data,
ompi_request_t **reqs );
static int write_init (ompio_file_t *fh, int aggregator, mca_io_ompio_aggregator_data *aggr_data,
int write_chunksize, int write_synchType, ompi_request_t **request);
int write_chunksize, int write_synchType, ompi_request_t **request,
bool is_accelerator_buffer);
int mca_fcoll_vulcan_break_file_view ( struct iovec *decoded_iov, int iov_count,
struct iovec *local_iov_array, int local_count,
struct iovec ***broken_decoded_iovs, int **broken_iov_counts,
@@ -155,6 +157,8 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,

ompi_count_array_t fview_count_desc;
ompi_disp_array_t displs_desc;
int is_gpu, is_managed;
bool use_accelerator_buffer = false;

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
@@ -180,6 +184,11 @@
goto exit;
}

mca_common_ompio_check_gpu_buf (fh, buf, &is_gpu, &is_managed);
if (is_gpu && !is_managed &&
fh->f_get_mca_parameter_value ("use_accelerator_buffers", strlen("use_accelerator_buffers"))) {
use_accelerator_buffer = true;
}
/* since we want to overlap 2 iterations, define the bytes_per_cycle to be half of what
the user requested */
bytes_per_cycle =bytes_per_cycle/2;
@@ -529,13 +538,31 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
goto exit;
}


aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle);
aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle);
if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){
opal_output(1, "OUT OF MEMORY");
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit;
if (use_accelerator_buffer) {
opal_output_verbose(10, ompi_fcoll_base_framework.framework_output,
"Allocating GPU device buffer for aggregation\n");
ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->global_buf,
bytes_per_cycle);
if (OPAL_SUCCESS != ret) {
opal_output(1, "Could not allocate accelerator memory");
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit;
}
ret = opal_accelerator.mem_alloc(MCA_ACCELERATOR_NO_DEVICE_ID, (void**)&aggr_data[i]->prev_global_buf,
bytes_per_cycle);
if (OPAL_SUCCESS != ret) {
opal_output(1, "Could not allocate accelerator memory");
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit;
}
} else {
aggr_data[i]->global_buf = (char *) malloc (bytes_per_cycle);
aggr_data[i]->prev_global_buf = (char *) malloc (bytes_per_cycle);
if (NULL == aggr_data[i]->global_buf || NULL == aggr_data[i]->prev_global_buf){
opal_output(1, "OUT OF MEMORY");
ret = OMPI_ERR_OUT_OF_RESOURCE;
goto exit;
}
}

aggr_data[i]->recvtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group *
@@ -605,7 +632,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
start_write_time = MPI_Wtime();
#endif
ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index],
write_chunksize, write_synch_type, &req_iwrite);
write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer);
if (OMPI_SUCCESS != ret){
goto exit;
}
@@ -645,7 +672,7 @@ int mca_fcoll_vulcan_file_write_all (struct ompio_file_t *fh,
start_write_time = MPI_Wtime();
#endif
ret = write_init (fh, fh->f_aggr_list[aggr_index], aggr_data[aggr_index],
write_chunksize, write_synch_type, &req_iwrite);
write_chunksize, write_synch_type, &req_iwrite, use_accelerator_buffer);
if (OMPI_SUCCESS != ret){
goto exit;
}
@@ -704,8 +731,13 @@ exit :

free (aggr_data[i]->disp_index);
free (aggr_data[i]->max_disp_index);
free (aggr_data[i]->global_buf);
free (aggr_data[i]->prev_global_buf);
if (use_accelerator_buffer) {
opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->global_buf);
opal_accelerator.mem_release(MCA_ACCELERATOR_NO_DEVICE_ID, aggr_data[i]->prev_global_buf);
} else {
free (aggr_data[i]->global_buf);
free (aggr_data[i]->prev_global_buf);
}
for(l=0;l<aggr_data[i]->procs_per_group;l++){
free (aggr_data[i]->blocklen_per_process[l]);
free (aggr_data[i]->displs_per_process[l]);
@@ -749,7 +781,8 @@ static int write_init (ompio_file_t *fh,
mca_io_ompio_aggregator_data *aggr_data,
int write_chunksize,
int write_synchType,
ompi_request_t **request )
ompi_request_t **request,
bool is_accelerator_buffer)
{
int ret = OMPI_SUCCESS;
ssize_t ret_temp = 0;
@@ -770,11 +803,20 @@
write_chunksize);

if (1 == write_synchType) {
ret = fh->f_fbtl->fbtl_ipwritev(fh, (ompi_request_t *) ompio_req);
if(0 > ret) {
opal_output (1, "vulcan_write_all: fbtl_ipwritev failed\n");
ompio_req->req_ompi.req_status.MPI_ERROR = ret;
ompio_req->req_ompi.req_status._ucount = 0;
if (is_accelerator_buffer) {
ret = mca_common_ompio_file_iwrite_pregen(fh, (ompi_request_t *) ompio_req);
if(0 > ret) {
opal_output (1, "vulcan_write_all: mca_common_ompio_iwrite_pregen failed\n");
ompio_req->req_ompi.req_status.MPI_ERROR = ret;
ompio_req->req_ompi.req_status._ucount = 0;
}
} else {
ret = fh->f_fbtl->fbtl_ipwritev(fh, (ompi_request_t *) ompio_req);
if(0 > ret) {
opal_output (1, "vulcan_write_all: fbtl_ipwritev failed\n");
ompio_req->req_ompi.req_status.MPI_ERROR = ret;
ompio_req->req_ompi.req_status._ucount = 0;
}
}
}
else {
5 changes: 4 additions & 1 deletion ompi/mca/io/ompio/io_ompio.c
@@ -15,7 +15,7 @@
* Copyright (c) 2012-2013 Inria. All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -556,6 +556,9 @@ int mca_io_ompio_get_mca_parameter_value ( char *mca_parameter_name, int name_le
else if ( !strncmp ( mca_parameter_name, "coll_timing_info", name_length )) {
return mca_io_ompio_coll_timing_info;
}
else if ( !strncmp (mca_parameter_name, "use_accelerator_buffers", name_length)) {
return mca_io_ompio_use_accelerator_buffers;
}
else {
opal_output (1, "Error in mca_io_ompio_get_mca_parameter_value: unknown parameter name");
}
4 changes: 2 additions & 2 deletions ompi/mca/io/ompio/io_ompio.h
@@ -14,7 +14,7 @@
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2024 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@@ -57,7 +57,7 @@ extern int mca_io_ompio_max_aggregators_ratio;
extern int mca_io_ompio_aggregators_cutoff_threshold;
extern int mca_io_ompio_overwrite_amode;
extern int mca_io_ompio_verbose_info_parsing;

extern int mca_io_ompio_use_accelerator_buffers;
OMPI_DECLSPEC extern int mca_io_ompio_coll_timing_info;

#define QUEUESIZE 2048
12 changes: 10 additions & 2 deletions ompi/mca/io/ompio/io_ompio_component.c
@@ -17,7 +17,7 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2018 DataDirect Networks. All rights reserved.
* Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2024 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
@@ -49,7 +49,7 @@ int mca_io_ompio_max_aggregators_ratio=8;
int mca_io_ompio_aggregators_cutoff_threshold=3;
int mca_io_ompio_overwrite_amode = 1;
int mca_io_ompio_verbose_info_parsing = 0;

int mca_io_ompio_use_accelerator_buffers = 1;
int mca_io_ompio_grouping_option=5;

/*
@@ -263,6 +263,14 @@ static int register_component(void)
MCA_BASE_VAR_SCOPE_READONLY,
&mca_io_ompio_verbose_info_parsing);

mca_io_ompio_use_accelerator_buffers = 1;
(void) mca_base_component_var_register(&mca_io_ompio_component.io_version,
"use_accelerator_buffers", "Allow using accelerator buffers"
"for data aggregation in collective I/O if input buffer is device memory",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9,
MCA_BASE_VAR_SCOPE_READONLY, &mca_io_ompio_use_accelerator_buffers);

return OMPI_SUCCESS;
}

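For reference, the GPU aggregation path is governed by the use_accelerator_buffers variable registered above, which defaults to enabled. Assuming the usual framework_component_variable naming for OMPIO MCA parameters, it should be possible to turn the feature off at run time with something like the following (my_app being a placeholder application):

    mpirun --mca io_ompio_use_accelerator_buffers 0 ./my_app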
