Skip to content

Commit

Permalink
ml/cnxk: updates to cn10k error handling
Browse files Browse the repository at this point in the history
Renamed cnxk error codes to cn10k error codes. Added
support for model-specific op_error_get routines.

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
  • Loading branch information
syalavarthi authored and jerinjacobk committed Sep 19, 2024
1 parent ecfc8af commit 986755f
Show file tree
Hide file tree
Showing 11 changed files with 71 additions and 39 deletions.
8 changes: 8 additions & 0 deletions drivers/ml/cnxk/cn10k_ml_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ static const char *const valid_args[] = {CN10K_ML_FW_PATH,
/* Supported OCM page sizes: 1KB, 2KB, 4KB, 8KB and 16KB */
static const int valid_ocm_page_size[] = {1024, 2048, 4096, 8192, 16384};

/* Error type database: one printable name per cn10k_ml_error_etype value.
 * Each entry sits at the index equal to its etype, which is how the table is
 * looked up when an error message is formatted.
 */
struct cn10k_ml_error_db ml_etype_db[] = {
	[ML_CN10K_ETYPE_NO_ERROR] = {ML_CN10K_ETYPE_NO_ERROR, "NO_ERROR"},
	[ML_CN10K_ETYPE_FW_NONFATAL] = {ML_CN10K_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
	[ML_CN10K_ETYPE_HW_NONFATAL] = {ML_CN10K_ETYPE_HW_NONFATAL, "HW_NON_FATAL"},
	[ML_CN10K_ETYPE_HW_FATAL] = {ML_CN10K_ETYPE_HW_FATAL, "HW_FATAL"},
	[ML_CN10K_ETYPE_HW_WARNING] = {ML_CN10K_ETYPE_HW_WARNING, "HW_WARNING"},
	[ML_CN10K_ETYPE_DRIVER] = {ML_CN10K_ETYPE_DRIVER, "DRIVER_ERROR"},
	[ML_CN10K_ETYPE_UNKNOWN] = {ML_CN10K_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
};

static int
parse_string_arg(const char *key __rte_unused, const char *value, void *extra_args)
{
Expand Down
16 changes: 16 additions & 0 deletions drivers/ml/cnxk/cn10k_ml_dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,22 @@ struct cnxk_ml_dev;
struct cnxk_ml_req;
struct cnxk_ml_qp;

/* Error types enumeration: top-level error class stored in the etype field of
 * a cn10k ML error code. Values are consecutive starting at zero.
 */
enum cn10k_ml_error_etype {
	ML_CN10K_ETYPE_NO_ERROR = 0x0,    /* No error */
	ML_CN10K_ETYPE_FW_NONFATAL = 0x1, /* Firmware non-fatal error */
	ML_CN10K_ETYPE_HW_NONFATAL = 0x2, /* Hardware non-fatal error */
	ML_CN10K_ETYPE_HW_FATAL = 0x3,    /* Hardware fatal error */
	ML_CN10K_ETYPE_HW_WARNING = 0x4,  /* Hardware warning */
	ML_CN10K_ETYPE_DRIVER = 0x5,      /* Driver specific error */
	ML_CN10K_ETYPE_UNKNOWN = 0x6,     /* Unknown error */
};

/* One row of an error lookup table: a numeric error type or sub-type code
 * paired with the text used for it when an error message is built.
 */
struct cn10k_ml_error_db {
/* Numeric error type / sub-type value this row describes */
uint64_t code;
/* Printable name for the code, at most RTE_ML_STR_MAX bytes including NUL */
char str[RTE_ML_STR_MAX];
};

/* Firmware non-fatal error sub-type */
enum cn10k_ml_error_stype_fw_nf {
/* 0x0 */ ML_CN10K_FW_ERR_NOERR = 0, /* No error */
Expand Down
20 changes: 11 additions & 9 deletions drivers/ml/cnxk/cn10k_ml_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#define ML_FLAGS_SSO_COMPL BIT(1)

/* Hardware non-fatal error subtype database */
static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
static struct cn10k_ml_error_db ml_stype_db_hw_nf[] = {
{ML_CN10K_FW_ERR_NOERR, "NO ERROR"},
{ML_CN10K_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
{ML_CN10K_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
Expand All @@ -38,7 +38,7 @@ static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
};

/* Driver error subtype database */
static struct cnxk_ml_error_db ml_stype_db_driver[] = {
static struct cn10k_ml_error_db ml_stype_db_driver[] = {
{ML_CN10K_DRIVER_ERR_NOERR, "NO ERROR"},
{ML_CN10K_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
{ML_CN10K_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
Expand Down Expand Up @@ -784,6 +784,7 @@ cn10k_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
model->result_update = cn10k_ml_result_update;
model->set_error_code = cn10k_ml_set_error_code;
model->set_poll_addr = cn10k_ml_set_poll_addr;
model->op_error_get = cn10k_ml_op_error_get;

return 0;
}
Expand Down Expand Up @@ -1257,7 +1258,7 @@ cn10k_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)

/* Handle driver error */
error_code = (union cn10k_ml_error_code *)&result->error_code;
if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
if (error_code->s.etype == ML_CN10K_ETYPE_DRIVER) {
cn10k_mldev = &cnxk_mldev->cn10k_mldev;

/* Check for exception */
Expand Down Expand Up @@ -1310,7 +1311,7 @@ cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui

memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
error_code->s.etype = ML_CN10K_ETYPE_UNKNOWN;
req->cn10k_req.result.user_ptr = op->user_ptr;

cnxk_ml_set_poll_ptr(req);
Expand All @@ -1324,24 +1325,25 @@ cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui
}

__rte_hot int
cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
cn10k_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
struct rte_ml_op_error *error)
{
union cn10k_ml_error_code *error_code;

PLT_SET_USED(dev);
PLT_SET_USED(cnxk_mldev);

error_code = (union cn10k_ml_error_code *)&op->impl_opaque;

/* Copy sub error message */
if (error_code->s.etype == ML_CNXK_ETYPE_HW_NONFATAL) {
if (error_code->s.etype == ML_CN10K_ETYPE_HW_NONFATAL) {
if (error_code->s.stype < PLT_DIM(ml_stype_db_hw_nf))
snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
ml_etype_db[error_code->s.etype].str,
ml_stype_db_hw_nf[error_code->s.stype].str);
else
snprintf(error->message, RTE_ML_STR_MAX, "%s : UNKNOWN ERROR",
ml_etype_db[error_code->s.etype].str);
} else if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
} else if (error_code->s.etype == ML_CN10K_ETYPE_DRIVER) {
snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
ml_etype_db[error_code->s.etype].str,
ml_stype_db_driver[error_code->s.stype].str);
Expand Down Expand Up @@ -1387,7 +1389,7 @@ cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,

memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
error_code->s.etype = ML_CN10K_ETYPE_UNKNOWN;
req->cn10k_req.result.user_ptr = NULL;

cnxk_ml_set_poll_ptr(req);
Expand Down
2 changes: 1 addition & 1 deletion drivers/ml/cnxk/cn10k_ml_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ int cn10k_ml_model_params_update(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_
/* Fast-path ops */
__rte_hot bool cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
__rte_hot int cn10k_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
struct rte_ml_op_error *error);
__rte_hot int cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
uint16_t nb_batches);
Expand Down
8 changes: 0 additions & 8 deletions drivers/ml/cnxk/cnxk_ml_dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,3 @@ int cnxk_ml_dev_initialized;

/* Dummy operations for ML device */
struct rte_ml_dev_ops ml_dev_dummy_ops = {0};

/* Error type database */
struct cnxk_ml_error_db ml_etype_db[] = {
{ML_CNXK_ETYPE_NO_ERROR, "NO_ERROR"}, {ML_CNXK_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
{ML_CNXK_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_CNXK_ETYPE_HW_FATAL, "HW_FATAL"},
{ML_CNXK_ETYPE_HW_WARNING, "HW_WARNING"}, {ML_CNXK_ETYPE_DRIVER, "DRIVER_ERROR"},
{ML_CNXK_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
};
18 changes: 1 addition & 17 deletions drivers/ml/cnxk/cnxk_ml_dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,6 @@
#define ML_CNXK_POLL_JOB_START 0
#define ML_CNXK_POLL_JOB_FINISH 1

/* Error types enumeration */
enum cnxk_ml_error_etype {
/* 0x0 */ ML_CNXK_ETYPE_NO_ERROR = 0, /* No error */
/* 0x1 */ ML_CNXK_ETYPE_FW_NONFATAL, /* Firmware non-fatal error */
/* 0x2 */ ML_CNXK_ETYPE_HW_NONFATAL, /* Hardware non-fatal error */
/* 0x3 */ ML_CNXK_ETYPE_HW_FATAL, /* Hardware fatal error */
/* 0x4 */ ML_CNXK_ETYPE_HW_WARNING, /* Hardware warning */
/* 0x5 */ ML_CNXK_ETYPE_DRIVER, /* Driver specific error */
/* 0x6 */ ML_CNXK_ETYPE_UNKNOWN, /* Unknown error */
};

struct cnxk_ml_error_db {
uint64_t code;
char str[RTE_ML_STR_MAX];
};

/* Device type */
enum cnxk_ml_dev_type {
/* PCI based Marvell's ML HW accelerator device */
Expand Down Expand Up @@ -115,6 +99,6 @@ struct cnxk_ml_dev {
struct cnxk_ml_index_map *index_map;
};

extern struct cnxk_ml_error_db ml_etype_db[];
extern struct cn10k_ml_error_db ml_etype_db[];

#endif /* _CNXK_ML_DEV_H_ */
3 changes: 3 additions & 0 deletions drivers/ml/cnxk/cnxk_ml_model.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ typedef bool (*enqueue_single_t)(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_o
typedef void (*result_update_t)(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
typedef void (*set_error_code_t)(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
typedef void (*set_poll_addr_t)(struct cnxk_ml_req *req);
/* Model-specific handler that fills 'error' with the error details of 'op'.
 * Stored in cnxk_ml_model::op_error_get and returns an int status.
 */
typedef int (*op_error_get_t)(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
struct rte_ml_op_error *error);

/* Model Object */
struct cnxk_ml_model {
Expand Down Expand Up @@ -184,6 +186,7 @@ struct cnxk_ml_model {
result_update_t result_update;
set_error_code_t set_error_code;
set_poll_addr_t set_poll_addr;
op_error_get_t op_error_get;
};

enum cnxk_ml_model_type cnxk_ml_model_get_type(struct rte_ml_model_params *params);
Expand Down
18 changes: 14 additions & 4 deletions drivers/ml/cnxk/cnxk_ml_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -647,9 +647,7 @@ cnxk_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *co

cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;

if (cnxk_mldev->type == CNXK_ML_DEV_TYPE_PCI)
cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
cnxk_mldev->mldev->op_error_get = cnxk_ml_op_error_get;

/* Allocate and initialize index_map */
if (cnxk_mldev->index_map == NULL) {
Expand Down Expand Up @@ -1640,7 +1638,7 @@ cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
if (plt_tsc_cycles() < req->timeout)
goto empty_or_active;
else /* Timeout, set indication of driver error */
model->set_error_code(req, ML_CNXK_ETYPE_DRIVER, 0);
model->set_error_code(req, ML_CN10K_ETYPE_DRIVER, 0);
}

model->result_update(cnxk_mldev, qp->id, req);
Expand All @@ -1658,6 +1656,18 @@ cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
return count;
}

/* Device-level op_error_get entry point.
 *
 * Resolves the model referenced by op->model_id from the device's model table
 * and forwards the request to that model's own op_error_get handler, returning
 * whatever the handler returns.
 */
__rte_hot int
cnxk_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
{
	struct cnxk_ml_dev *cnxk_mldev = dev->data->dev_private;
	struct cnxk_ml_model *model = cnxk_mldev->mldev->data->models[op->model_id];

	/* The handler was chosen when the model was loaded */
	return model->op_error_get(cnxk_mldev, op, error);
}

struct rte_ml_dev_ops cnxk_ml_ops = {
/* Device control ops */
.dev_info_get = cnxk_ml_dev_info_get,
Expand Down
2 changes: 2 additions & 0 deletions drivers/ml/cnxk/cnxk_ml_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,5 +83,7 @@ __rte_hot uint16_t cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id,
struct rte_ml_op **ops, uint16_t nb_ops);
__rte_hot void cnxk_ml_set_poll_ptr(struct cnxk_ml_req *req);
__rte_hot uint64_t cnxk_ml_get_poll_ptr(struct cnxk_ml_req *req);
/* Device-level op_error_get: dispatches to the op_error_get handler of the
 * model identified by op->model_id.
 */
__rte_hot int cnxk_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
struct rte_ml_op_error *error);

#endif /* _CNXK_ML_OPS_H_ */
13 changes: 13 additions & 0 deletions drivers/ml/cnxk/mvtvm_ml_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -329,11 +329,13 @@ mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
model->result_update = cn10k_ml_result_update;
model->set_error_code = cn10k_ml_set_error_code;
model->set_poll_addr = cn10k_ml_set_poll_addr;
model->op_error_get = cn10k_ml_op_error_get;
} else {
model->enqueue_single = mvtvm_ml_enqueue_single;
model->result_update = mvtvm_ml_result_update;
model->set_error_code = mvtvm_ml_set_error_code;
model->set_poll_addr = mvtvm_ml_set_poll_addr;
model->op_error_get = mvtvm_ml_op_error_get;
}

return 0;
Expand Down Expand Up @@ -584,6 +586,17 @@ mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
req->mvtvm_req.result.error_code = etype;
}

__rte_hot int
mvtvm_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
struct rte_ml_op_error *error)
{
RTE_SET_USED(cnxk_mldev);
RTE_SET_USED(op);
RTE_SET_USED(error);

return 0;
}

__rte_hot bool
mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
struct cnxk_ml_qp *qp, uint64_t head)
Expand Down
2 changes: 2 additions & 0 deletions drivers/ml/cnxk/mvtvm_ml_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ int mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_na

__rte_hot bool mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
/* Model-specific op_error_get handler; mvtvm_ml_model_load() assigns it to
 * model->op_error_get for models that do not use the cn10k handlers.
 */
__rte_hot int mvtvm_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
struct rte_ml_op_error *error);
__rte_hot void mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
__rte_hot void mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);

Expand Down

0 comments on commit 986755f

Please sign in to comment.