Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROCm component refactoring #49

Merged
merged 26 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
fd53f23
rocm: replace trailing Ptr in rocm functions with _p
gcongiu May 16, 2023
ba37013
rocm: rename hsa_agent_arr_t to device_table_t
gcongiu May 16, 2023
8bcd664
rocm: remove macros handling error management
gcongiu May 16, 2023
6207a4d
rocm: update returned error codes
gcongiu May 16, 2023
a4408ca
rocm: remove ROCM_PROF_ROCPROFILER guard
gcongiu Jul 12, 2023
4af4d8c
rocm: extract shared functionality
gcongiu May 17, 2023
46de36c
rocm: rename source files for better readability
gcongiu Jul 12, 2023
b023090
rocm: remove unneeded comments
gcongiu Jul 12, 2023
3c7dcbf
rocm: move extern declarations to config header
gcongiu Jul 12, 2023
333f169
rocm: extract all device booking and checking functions
gcongiu Jul 13, 2023
e4a6394
rocm: use snprintf instead of strncpy
gcongiu Jul 13, 2023
e7ab129
rocm: remove FIXME comment
gcongiu Jul 13, 2023
be382fa
rocm: reformat roc_profiler.c code
gcongiu Jul 13, 2023
245216b
rocm: rename rocp_config.h to roc_profiler_config.h
gcongiu Jul 13, 2023
8bfc358
rocm: rename evt_get_descr to evt_code_to_descr
gcongiu Jul 14, 2023
8c149d0
rocm: remove leftover err_get_last function header
gcongiu Jul 14, 2023
bee21ae
rocm: move agent to id function to roc_common
gcongiu Jul 17, 2023
6747038
rocm: remove roc_common.h from roc_profiler.h
gcongiu Jul 17, 2023
e5c2a80
rocm: fix warning in roc_common.c
gcongiu Jul 17, 2023
1bade8c
rocm: move thread id get function to roc_common
gcongiu Jul 18, 2023
0740057
rocm: fix warning in callback function
gcongiu Jul 20, 2023
f32ad7d
rocm: refactor rocc_dev_get_{count,id} functions
gcongiu Jul 20, 2023
a99acfa
rocm: funnel exits through same point in component frontend
gcongiu Sep 8, 2023
8ad6068
rocm: add logging to component frontend
gcongiu Sep 8, 2023
33702cc
rocm: add logging to component backend
gcongiu Sep 8, 2023
60d30cc
rocm: fix typo in ctx_open
gcongiu Oct 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions src/components/rocm/Rules.rocm
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
PAPI_ROCM_ROOT ?= /opt/rocm

COMPSRCS += components/rocm/rocm.c \
components/rocm/rocp.c \
components/rocm/rocd.c
components/rocm/roc_profiler.c \
components/rocm/roc_dispatch.c \
components/rocm/roc_common.c

COMPOBJS += rocm.o rocp.o rocd.o
COMPOBJS += rocm.o roc_profiler.o roc_dispatch.o roc_common.o

CFLAGS += -I$(PAPI_ROCM_ROOT)/include \
-I$(PAPI_ROCM_ROOT)/include/hsa \
Expand All @@ -18,8 +19,11 @@ LDFLAGS += $(LDL)
rocm.o: components/rocm/rocm.c $(HEADERS)
$(CC) $(LIBCFLAGS) $(OPTFLAGS) -c $< -o $@

rocp.o: components/rocm/rocp.c $(HEADERS)
roc_profiler.o: components/rocm/roc_profiler.c $(HEADERS)
$(CC) $(LIBCFLAGS) $(OPTFLAGS) -c $< -o $@

rocd.o: components/rocm/rocd.c $(HEADERS)
roc_dispatch.o: components/rocm/roc_dispatch.c $(HEADERS)
$(CC) $(LIBCFLAGS) $(OPTFLAGS) -c $< -o $@

roc_common.o: components/rocm/roc_common.c $(HEADERS)
$(CC) $(LIBCFLAGS) $(OPTFLAGS) -c $< -o $@
27 changes: 0 additions & 27 deletions src/components/rocm/common.h

This file was deleted.

306 changes: 306 additions & 0 deletions src/components/rocm/roc_common.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
#include <dlfcn.h>
#include <string.h>
#include "papi.h"
#include "papi_memory.h"
#include "roc_common.h"

/* hsa function pointers: each one is resolved with dlsym() in load_hsa_sym()
 * and reset to NULL in unload_hsa_sym(). They are not declared static, so
 * they have external linkage. */
hsa_status_t (*hsa_init_p)(void);
hsa_status_t (*hsa_shut_down_p)(void);
hsa_status_t (*hsa_iterate_agents_p)(hsa_status_t (*)(hsa_agent_t, void *), void *);
hsa_status_t (*hsa_system_get_info_p)(hsa_system_info_t, void *);
hsa_status_t (*hsa_agent_get_info_p)(hsa_agent_t, hsa_agent_info_t, void *);
hsa_status_t (*hsa_queue_destroy_p)(hsa_queue_t *);
hsa_status_t (*hsa_status_string_p)(hsa_status_t, const char **);

/* handle returned by dlopen() for libhsa-runtime64.so (see load_hsa_sym) */
static void *hsa_dlp;
/* text of the most recent error; handed out by rocc_err_get_last() */
char error_string[PAPI_MAX_STR_LEN];
/* table of GPU agents filled in by get_agent_handle_cb() */
static device_table_t device_table;
/* set to &device_table by rocc_init() once initialization has succeeded */
device_table_t *device_table_p;
/* bits set by rocc_dev_acquire() and cleared by rocc_dev_release() */
static rocc_bitmap_t global_device_map;

/* forward declarations of the file-local helpers defined further down */
static int load_hsa_sym(void);
static int unload_hsa_sym(void);
static int init_device_table(void);
static void init_thread_id_fn(void);
/* thread id getter chosen by init_thread_id_fn(); used by rocc_thread_get_id() */
static unsigned long (*thread_id_fn)(void);

/**
 * Bring up the shared ROCm layer.
 *
 * Steps, in order: load the hsa symbols, call hsa_init, build the device
 * table, publish the table through device_table_p and select the thread id
 * getter. Whenever a step fails the hsa symbols are unloaded again; if the
 * failure happens after hsa_init succeeded, hsa_shut_down is called first.
 *
 * @return PAPI_OK on success, the error reported by load_hsa_sym() or
 *         init_device_table(), or PAPI_EMISC when hsa_init fails.
 */
int
rocc_init(void)
{
    int rc = load_hsa_sym();
    if (rc != PAPI_OK) {
        unload_hsa_sym();
        return rc;
    }

    if (hsa_init_p() != HSA_STATUS_SUCCESS) {
        unload_hsa_sym();
        return PAPI_EMISC;
    }

    rc = init_device_table();
    if (rc != PAPI_OK) {
        /* hsa was initialized above, so it has to be shut down again */
        hsa_shut_down_p();
        unload_hsa_sym();
        return rc;
    }

    device_table_p = &device_table;
    init_thread_id_fn();

    return rc;
}

int
rocc_shutdown(void)
{
hsa_shut_down_p();
unload_hsa_sym();
return PAPI_OK;
}

/**
 * Hand out the most recent error message.
 *
 * @param err_string  receives a pointer to the file-level error_string
 *                    buffer; the buffer is not copied.
 * @return always PAPI_OK.
 */
int
rocc_err_get_last(const char **err_string)
{
    const char *last_error = error_string;

    *err_string = last_error;
    return PAPI_OK;
}

/**
 * Build the bitmap of devices referenced by a list of events.
 *
 * @param query_dev_id  callback translating an event id into a device id;
 *                      a non-zero return is treated as failure.
 * @param events_id     array of event ids.
 * @param num_events    number of entries in events_id.
 * @param bitmap        receives the resulting map (bit N set means device N
 *                      is used); written only on success.
 * @return PAPI_OK on success, PAPI_EMISC if the callback fails or reports a
 *         device id that does not fit in the bitmap.
 */
int
rocc_dev_get_map(rocc_dev_get_map_cb query_dev_id, unsigned int *events_id, int num_events, rocc_bitmap_t *bitmap)
{
    rocc_bitmap_t device_map_acq = 0;

    for (int i = 0; i < num_events; ++i) {
        unsigned int dev_id;
        if (query_dev_id(events_id[i], &dev_id)) {
            return PAPI_EMISC;
        }

        /* shifting by the width of the type or more is undefined (8 bits
         * per byte assumed) */
        if (dev_id >= sizeof(rocc_bitmap_t) * 8) {
            return PAPI_EMISC;
        }

        /* shift in the bitmap type rather than in int, so that bits beyond
         * the width of int can be represented */
        device_map_acq |= ((rocc_bitmap_t) 1 << dev_id);
    }

    *bitmap = device_map_acq;
    return PAPI_OK;
}

/**
 * Mark the devices in bitmap as acquired.
 *
 * @param bitmap  devices to acquire.
 * @return PAPI_EINVAL (leaving global_device_map untouched) if any of the
 *         requested devices is already acquired, PAPI_OK otherwise.
 */
int
rocc_dev_acquire(rocc_bitmap_t bitmap)
{
    rocc_bitmap_t already_taken = bitmap & global_device_map;

    if (already_taken) {
        return PAPI_EINVAL;
    }

    global_device_map = global_device_map | bitmap;
    return PAPI_OK;
}

/**
 * Mark the devices in bitmap as no longer acquired.
 *
 * @param bitmap  devices to release.
 * @return PAPI_EINVAL (leaving global_device_map untouched) unless every
 *         requested device is currently acquired, PAPI_OK otherwise.
 */
int
rocc_dev_release(rocc_bitmap_t bitmap)
{
    rocc_bitmap_t currently_held = bitmap & global_device_map;

    if (currently_held != bitmap) {
        return PAPI_EINVAL;
    }

    global_device_map = global_device_map & ~bitmap;
    return PAPI_OK;
}

/* file-local worker shared by rocc_dev_get_count() and rocc_dev_get_id() */
static int dev_get_count(rocc_bitmap_t bitmap, int *num_devices);

/**
 * Count the devices present in a bitmap.
 *
 * @param bitmap       device map to inspect.
 * @param num_devices  receives the number of set bits.
 * @return whatever dev_get_count() returns.
 */
int
rocc_dev_get_count(rocc_bitmap_t bitmap, int *num_devices)
{
    int papi_errno = dev_get_count(bitmap, num_devices);

    return papi_errno;
}

/**
 * Count the set bits of bitmap.
 *
 * The earlier declaration of this function in the file is static.
 *
 * @param bitmap       device map to inspect.
 * @param num_devices  receives the number of set bits.
 * @return always PAPI_OK.
 */
int
dev_get_count(rocc_bitmap_t bitmap, int *num_devices)
{
    int set_bits = 0;
    rocc_bitmap_t remaining;

    /* x & (~x + 1) isolates the lowest set bit; subtracting it clears it */
    for (remaining = bitmap; remaining; remaining -= remaining & (~remaining + 1)) {
        ++set_bits;
    }

    *num_devices = set_bits;
    return PAPI_OK;
}

/**
 * Get the id of the dev_count-th device (0-based, counting from the least
 * significant bit) present in bitmap.
 *
 * @param bitmap     device map to inspect.
 * @param dev_count  index of the wanted device among the set bits.
 * @param device_id  receives the bit position of that device; written only
 *                   on success.
 * @return PAPI_OK on success, PAPI_EMISC if dev_count is negative or not
 *         smaller than the number of devices in bitmap.
 */
int
rocc_dev_get_id(rocc_bitmap_t bitmap, int dev_count, int *device_id)
{
    int count = 0;

    dev_get_count(bitmap, &count);
    /* A negative index would never match in the loop below; with an empty
     * bitmap lsb would stay 0 and the final loop would never terminate. */
    if (dev_count < 0 || dev_count >= count) {
        return PAPI_EMISC;
    }

    /* Peel set bits off the low end until the wanted one is held in lsb. */
    count = 0;
    rocc_bitmap_t lsb = 0;
    while (bitmap) {
        lsb = bitmap & (~bitmap + 1);
        bitmap -= lsb;
        if (count++ == dev_count) {
            break;
        }
    }

    /* Convert the isolated bit into its position. */
    *device_id = 0;
    while (!(lsb & 0x1)) {
        ++(*device_id);
        lsb >>= 1;
    }

    return PAPI_OK;
}

/**
 * Look up the index of an agent in the device table.
 *
 * Entries are compared byte-wise with memcmp().
 *
 * @param agent   agent to look for.
 * @param dev_id  receives the index of the first matching entry; if no entry
 *                matches it receives device_table_p->count.
 * @return always PAPI_OK, including when no entry matches.
 */
int
rocc_dev_get_agent_id(hsa_agent_t agent, unsigned int *dev_id)
{
    unsigned int idx = 0;

    while (idx < (unsigned int) device_table_p->count &&
           memcmp(&device_table_p->devices[idx], &agent, sizeof(agent)) != 0) {
        ++idx;
    }

    *dev_id = idx;
    return PAPI_OK;
}

int
rocc_thread_get_id(unsigned long *tid)
{
*tid = thread_id_fn();
return PAPI_OK;
}

/**
 * Open libhsa-runtime64.so under $PAPI_ROCM_ROOT/lib and resolve the hsa
 * entry points used by this file.
 *
 * On failure a description is stored in error_string. When the library was
 * opened but a symbol is missing, the handle is left open in hsa_dlp;
 * unload_hsa_sym() closes it.
 *
 * @return PAPI_OK on success;
 *         PAPI_ENOSUPP if PAPI_ROCM_ROOT is unset, the resulting path does
 *         not fit in PATH_MAX, or dlopen() fails;
 *         PAPI_EMISC if any symbol cannot be resolved.
 */
int
load_hsa_sym(void)
{
    int papi_errno = PAPI_OK;

    char pathname[PATH_MAX] = { 0 };
    char *rocm_root = getenv("PAPI_ROCM_ROOT");
    if (rocm_root == NULL) {
        snprintf(error_string, PAPI_MAX_STR_LEN, "Can't load libhsa-runtime64.so, PAPI_ROCM_ROOT not set.");
        goto fn_fail;
    }

    /* The root comes from the environment, so bound the write and refuse a
     * truncated path rather than overflow pathname or open the wrong file. */
    int path_len = snprintf(pathname, sizeof(pathname), "%s/lib/libhsa-runtime64.so", rocm_root);
    if (path_len < 0 || (size_t) path_len >= sizeof(pathname)) {
        snprintf(error_string, PAPI_MAX_STR_LEN, "Can't load libhsa-runtime64.so, PAPI_ROCM_ROOT is too long.");
        goto fn_fail;
    }

    hsa_dlp = dlopen(pathname, RTLD_NOW | RTLD_GLOBAL);
    if (hsa_dlp == NULL) {
        snprintf(error_string, PAPI_MAX_STR_LEN, "%s", dlerror());
        goto fn_fail;
    }

    hsa_init_p            = dlsym(hsa_dlp, "hsa_init");
    hsa_shut_down_p       = dlsym(hsa_dlp, "hsa_shut_down");
    hsa_iterate_agents_p  = dlsym(hsa_dlp, "hsa_iterate_agents");
    hsa_system_get_info_p = dlsym(hsa_dlp, "hsa_system_get_info");
    hsa_agent_get_info_p  = dlsym(hsa_dlp, "hsa_agent_get_info");
    hsa_queue_destroy_p   = dlsym(hsa_dlp, "hsa_queue_destroy");
    hsa_status_string_p   = dlsym(hsa_dlp, "hsa_status_string");

    int hsa_not_initialized = (!hsa_init_p            ||
                               !hsa_shut_down_p       ||
                               !hsa_iterate_agents_p  ||
                               !hsa_system_get_info_p ||
                               !hsa_agent_get_info_p  ||
                               !hsa_queue_destroy_p   ||
                               !hsa_status_string_p);

    papi_errno = (hsa_not_initialized) ? PAPI_EMISC : PAPI_OK;
    if (papi_errno != PAPI_OK) {
        snprintf(error_string, PAPI_MAX_STR_LEN, "Error while loading hsa symbols.");
    }

  fn_exit:
    return papi_errno;
  fn_fail:
    papi_errno = PAPI_ENOSUPP;
    goto fn_exit;
}

/**
 * Forget the resolved hsa entry points and close the library handle.
 *
 * Does nothing when no handle is open. The handle is cleared after
 * dlclose() so that a later call (for instance rocc_shutdown() following a
 * failed rocc_init()) does not close a stale handle a second time.
 *
 * @return always PAPI_OK.
 */
int
unload_hsa_sym(void)
{
    if (hsa_dlp == NULL) {
        return PAPI_OK;
    }

    hsa_init_p            = NULL;
    hsa_shut_down_p       = NULL;
    hsa_iterate_agents_p  = NULL;
    hsa_system_get_info_p = NULL;
    hsa_agent_get_info_p  = NULL;
    hsa_queue_destroy_p   = NULL;
    hsa_status_string_p   = NULL;

    dlclose(hsa_dlp);
    hsa_dlp = NULL;

    return PAPI_OK;
}

/* callback passed to hsa_iterate_agents; stores GPU agents in the table */
static hsa_status_t get_agent_handle_cb(hsa_agent_t, void *);

/**
 * Fill device_table by iterating over the hsa agents.
 *
 * The table is emptied first so that initializing again after a shutdown
 * does not append a second copy of every agent. On failure the hsa status
 * text (or a generic message when it cannot be obtained) is stored in
 * error_string and the table is left empty.
 *
 * @return PAPI_OK on success, PAPI_EMISC if the iteration fails.
 */
int
init_device_table(void)
{
    int papi_errno = PAPI_OK;

    device_table.count = 0;

    hsa_status_t hsa_errno = hsa_iterate_agents_p(get_agent_handle_cb, &device_table);
    if (hsa_errno != HSA_STATUS_SUCCESS) {
        /* hsa_status_string may fail and leave the pointer untouched, so
         * start from NULL and fall back to a fixed message. */
        const char *error_string_p = NULL;
        if (hsa_status_string_p(hsa_errno, &error_string_p) != HSA_STATUS_SUCCESS ||
            error_string_p == NULL) {
            error_string_p = "Unknown hsa error while iterating agents.";
        }
        snprintf(error_string, PAPI_MAX_STR_LEN, "%s", error_string_p);
        goto fn_fail;
    }

  fn_exit:
    return papi_errno;
  fn_fail:
    papi_errno = PAPI_EMISC;
    device_table.count = 0;
    goto fn_exit;
}

/**
 * Agent iteration callback: append GPU agents to the device table.
 *
 * @param agent         agent being visited.
 * @param device_table  the device_table_t to fill, passed as void *.
 * @return the status of the device-type query if it fails, otherwise
 *         HSA_STATUS_SUCCESS (agents that are not GPUs are skipped).
 *
 * The capacity of the table (PAPI_ROCM_MAX_DEV_COUNT) is enforced only by
 * an assert.
 */
hsa_status_t
get_agent_handle_cb(hsa_agent_t agent, void *device_table)
{
    device_table_t *table = (device_table_t *) device_table;
    hsa_device_type_t agent_type;

    hsa_status_t status = hsa_agent_get_info_p(agent, HSA_AGENT_INFO_DEVICE, &agent_type);
    if (status != HSA_STATUS_SUCCESS) {
        return status;
    }

    if (agent_type != HSA_DEVICE_TYPE_GPU) {
        return HSA_STATUS_SUCCESS;
    }

    assert(table->count < PAPI_ROCM_MAX_DEV_COUNT);
    table->devices[table->count] = agent;
    table->count += 1;

    return HSA_STATUS_SUCCESS;
}

/**
 * Select the thread id getter once.
 *
 * If thread_id_fn is already set it is left alone. Otherwise it becomes
 * _papi_hwi_thread_id_fn when that is non-NULL, and _papi_getpid when it is
 * NULL.
 */
void
init_thread_id_fn(void)
{
    if (!thread_id_fn) {
        if (_papi_hwi_thread_id_fn) {
            thread_id_fn = _papi_hwi_thread_id_fn;
        } else {
            thread_id_fn = _papi_getpid;
        }
    }
}
Loading
Loading