Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Topic/fix ob1 segmentation with UCT BTL #12823

Merged
merged 3 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ompi/mca/pml/ob1/pml_ob1_isend.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ static inline int mca_pml_ob1_send_inline (const void *buf, size_t count,
}

if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
return rc;
return rc;
}

return (int) size;
Expand Down
2 changes: 1 addition & 1 deletion opal/datatype/opal_datatype_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ struct opal_datatype_t;
# define OPAL_DATATYPE_SAFEGUARD_POINTER(ACTPTR, LENGTH, INITPTR, PDATA, COUNT) \
{ \
unsigned char *__lower_bound = (INITPTR), *__upper_bound; \
assert(((LENGTH) != 0) && ((COUNT) != 0)); \
assert( (COUNT) != 0 ); \
__lower_bound += (PDATA)->true_lb; \
__upper_bound = (INITPTR) + (PDATA)->true_ub + \
((PDATA)->ub - (PDATA)->lb) * ((COUNT) -1); \
Expand Down
7 changes: 4 additions & 3 deletions opal/datatype/opal_datatype_position.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ static inline void position_single_block(opal_convertor_t *CONVERTOR, unsigned c
}

/**
* Advance the convertors' position according. Update the pointer and the remaining space
* accordingly.
* Advance the convertor's position to account for *COUNT elements. Update
* the pointer and the remaining space accordingly.
*/
static inline void position_predefined_data(opal_convertor_t *CONVERTOR, dt_elem_desc_t *ELEM,
size_t *COUNT, unsigned char **POINTER, size_t *SPACE)
Expand All @@ -82,7 +82,8 @@ static inline void position_predefined_data(opal_convertor_t *CONVERTOR, dt_elem

if (cando_count > *(COUNT)) {
cando_count = *(COUNT);
}
} else if( 0 == cando_count )
return;

if (1 == _elem->blocklen) {
DO_DEBUG(opal_output(0,
Expand Down
14 changes: 0 additions & 14 deletions opal/mca/btl/sm/btl_sm_send.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,18 +73,4 @@ int mca_btl_sm_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpo
}

return OPAL_SUCCESS;

#if 0
if (((frag->hdr->flags & MCA_BTL_SM_FLAG_SINGLE_COPY) ||
!(frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP)) &&
frag->base.des_cbfunc) {
frag->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;

return OPAL_SUCCESS;
}

/* data is gone (from the pml's perspective). frag callback/release will
happen later */
return 1;
#endif
}
30 changes: 20 additions & 10 deletions opal/mca/btl/uct/btl_uct_am.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ mca_btl_base_descriptor_t *mca_btl_uct_alloc(mca_btl_base_module_t *btl,
}

static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t header_size,
opal_convertor_t *convertor, size_t payload_size)
opal_convertor_t *convertor, size_t* payload_size)
{
uint32_t iov_count = 1;
struct iovec iov;
Expand All @@ -64,11 +64,9 @@ static inline void _mca_btl_uct_send_pack(void *data, void *header, size_t heade

/* pack the data into the supplied buffer */
iov.iov_base = (IOVBASE_TYPE *) ((intptr_t) data + header_size);
iov.iov_len = length = payload_size;
iov.iov_len = *payload_size;

(void) opal_convertor_pack(convertor, &iov, &iov_count, &length);

assert(length == payload_size);
(void) opal_convertor_pack(convertor, &iov, &iov_count, payload_size);
}

struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t *btl,
Expand All @@ -92,7 +90,10 @@ struct mca_btl_base_descriptor_t *mca_btl_uct_prepare_src(mca_btl_base_module_t
}

_mca_btl_uct_send_pack((void *) ((intptr_t) frag->uct_iov.buffer + reserve), NULL, 0,
convertor, *size);
convertor, size);
/* update the length of the fragment according to the convertor packed data */
frag->segments[0].seg_len = reserve + *size;
frag->uct_iov.length = frag->segments[0].seg_len;
} else {
opal_convertor_get_current_pointer(convertor, &data_ptr);
assert(NULL != data_ptr);
Expand Down Expand Up @@ -286,7 +287,7 @@ static size_t mca_btl_uct_sendi_pack(void *data, void *arg)

am_header->value = args->am_header;
_mca_btl_uct_send_pack((void *) ((intptr_t) data + 8), args->header, args->header_size,
args->convertor, args->payload_size);
args->convertor, &args->payload_size);
return args->header_size + args->payload_size + 8;
}

Expand Down Expand Up @@ -329,9 +330,18 @@ int mca_btl_uct_sendi(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpo
} else if (msg_size < (size_t) MCA_BTL_UCT_TL_ATTR(uct_btl->am_tl, context->context_id)
.cap.am.max_short) {
int8_t *data = alloca(total_size);
_mca_btl_uct_send_pack(data, header, header_size, convertor, payload_size);
ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, data,
total_size);
size_t packed_payload_size = payload_size;
_mca_btl_uct_send_pack(data, header, header_size, convertor, &packed_payload_size);
if (packed_payload_size != payload_size) {
/* This should never happen as the packed data should go in a single pack. But
in case it does, fall back onto a descriptor allocation and let the caller
send the data.
*/
ucs_status = UCS_ERR_NO_RESOURCE;
} else {
ucs_status = uct_ep_am_short(ep_handle, MCA_BTL_UCT_FRAG, am_header.value, data,
total_size);
}
} else {
ssize_t size;

Expand Down
Loading