From b4793785ed9de0723cda7051bc881799629568ff Mon Sep 17 00:00:00 2001 From: Joseph Handzik Date: Fri, 9 Apr 2021 14:07:38 -0500 Subject: [PATCH] Add support for 125KB IO workload Signed-Off-By: Joe Handzik --- .gitlab-ci.yml | 2 +- bobber/bobber.py | 23 +++++--- bobber/lib/analysis/aggregate_results.py | 73 ++++++++++++++++++++++++ bobber/lib/analysis/parse_results.py | 50 +++++++++++++++- bobber/lib/analysis/table.py | 35 ++++++++++++ bobber/lib/constants.py | 4 ++ bobber/lib/tests/run_tests.py | 48 ++++++++++++++++ 7 files changed, 225 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 57070be..db9a8a3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -41,5 +41,5 @@ test: - python setup.py bdist_wheel sdist - pip install dist/nvidia_bobber-*-none-any.whl - bobber cast /raid - - bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --iops-threads 200 test_results localhost + - bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --125k-threads 32 --iops-threads 96 test_results localhost - bobber parse-results --compare-baseline single-dgx-station-baseline test_results/ diff --git a/bobber/bobber.py b/bobber/bobber.py index 367901f..69dd784 100644 --- a/bobber/bobber.py +++ b/bobber/bobber.py @@ -19,6 +19,7 @@ RUN_NCCL, RUN_STG_BW, RUN_STG_IOPS, + RUN_STG_125K, RUN_STG_META, SYSTEMS ) @@ -129,6 +130,9 @@ def parse_args(version: str) -> Namespace: commands_parent.add_argument('--bw-threads', help='Maximum number of ' 'threads to use for bandwidth tests', type=int) + commands_parent.add_argument('--125k-threads', dest='stg_125k_threads', + help='Maximum number of threads to use for ' + '125K IO size tests', type=int) commands_parent.add_argument('--iops-threads', help='Maximum number of ' 'threads to use for iops tests', type=int) commands_parent.add_argument('--iterations', help='Number of iterations to' @@ -143,11 +147,12 @@ def parse_args(version: str) -> Namespace: 'would result in tests for 1, 2, and 3 ' 'systems)', action='store_true') commands_parent.add_argument('--system', help='If system is specified, ' - 'iops-threads, bw-threads, gpus, batch size, ' - 'and network interface names are given ' - 'default values - override by specifying the ' - 'flags you\'d prefer to override, ignore the ' - 'flags you are ok with using defaults for ' + 'iops-threads, 125k-threads, bw-threads, ' + 'gpus, batch size, and network interface ' + 'names are given default values - override ' + 'by specifying the flags you\'d prefer to ' + 'override, ignore the flags you are ok with ' + 'using defaults for ' 'supported systems: dgx-a100-single, ' 'dgx-a100-dual, and dgx-2 for now. -single ' 'is used for a system with a single storage ' @@ -170,11 +175,13 @@ def parse_args(version: str) -> Namespace: parents=[commands_parent]) commands.add_parser(RUN_NCCL, help='Run NCCL tests only', parents=[commands_parent]) - commands.add_parser(RUN_STG_BW, help='Run storage bandwdith tests only', + commands.add_parser(RUN_STG_BW, help='Run storage bandwidth test only', parents=[commands_parent]) - commands.add_parser(RUN_STG_IOPS, help='Run storage IOPS tests only', + commands.add_parser(RUN_STG_125K, help='Run storage 125 IO size test only', parents=[commands_parent]) - commands.add_parser(RUN_STG_META, help='Run storage metadata tests only', + commands.add_parser(RUN_STG_IOPS, help='Run storage IOPS test only', + parents=[commands_parent]) + commands.add_parser(RUN_STG_META, help='Run storage metadata test only', parents=[commands_parent]) # Options specific to exporting the containers diff --git a/bobber/lib/analysis/aggregate_results.py b/bobber/lib/analysis/aggregate_results.py index 2f5ee7f..61be50f 100644 --- a/bobber/lib/analysis/aggregate_results.py +++ b/bobber/lib/analysis/aggregate_results.py @@ -70,6 +70,18 @@ class AggregateResults: write_iops_params : dict A ``dictionary`` of the parameters used during the fio write iops tests. + read_125k_bw : dict + A ``dictionary`` containing all of the fio 125k read bandwidth results + for N-systems. + write_125k_bw : dict + A ``dictionary`` containing all of the fio 125k write bandwidth results + for N-systems. + read_125k_bw_params : dict + A ``dictionary`` of the parameters used during the fio 125k read + bandwidth tests. + write_125k_bw_params : dict + A ``dictionary`` of the parameters used during the fio 125k write + bandwidth tests. max_bw : dict A ``dictionary`` of the maximum bus bandwidth achieved from NCCL tests. bytes_sizes : dict @@ -93,6 +105,10 @@ def __init__(self, write_iops: dict, read_iops_params: dict, write_iops_params: dict, + read_125k_bw: dict, + write_125k_bw: dict, + read_125k_bw_params: dict, + write_125k_bw_params: dict, max_bw: dict, bytes_sizes: dict, dali_results: dict, @@ -102,10 +118,14 @@ def __init__(self, self._read_bw_params = read_bw_params self._read_iops = read_iops self._read_iops_params = read_iops_params + self._125k_read_bw = read_125k_bw + self._125k_read_bw_params = read_125k_bw_params self._write_bw = write_bw self._write_bw_params = write_bw_params self._write_iops = write_iops self._write_iops_params = write_iops_params + self._125k_write_bw = write_125k_bw + self._125k_write_bw_params = write_125k_bw_params self._max_bw = max_bw self._bytes_sizes = bytes_sizes self._dali_results = dali_results @@ -124,6 +144,8 @@ def __str__(self) -> str: Aggregate Write Bandwidth: 1.232 GB/s Aggregate Read IOPS: 136.5 k IOPS Aggregate Write IOPS: 135.0 k IOPS + Aggregate 125k Read Bandwidth: 1.595 GB/s + Aggregate 125k Write Bandwidth: 1.232 GB/s NCCL Max Bus Bandwidth: 79.865 at 512.0 MB Mdtest Directory creation: 71406.29550000001 ops @@ -159,6 +181,10 @@ def __str__(self) -> str: ['Systems tested:', self._num_systems, ''], ['Aggregate Read Bandwidth:', self.average_read_bw, ' GB/s'], ['Aggregate Write Bandwidth:', self.average_write_bw, ' GB/s'], + ['Aggregate 125k Read Bandwidth:', self.average_125k_read_bw, + ' GB/s'], + ['Aggregate 125k Write Bandwidth:', self.average_125k_write_bw, + ' GB/s'], ['Aggregate Read IOPS:', self.average_read_iops, 'k IOPS'], ['Aggregate Write IOPS:', self.average_write_iops, 'k IOPS'], ] @@ -275,6 +301,15 @@ def json(self) -> dict: 'write': self._write_iops_params } }, + '125k_bandwidth': { + 'read': self._average_125k_read_bw(), + 'write': self._average_125k_write_bw(), + 'unit': 'operations/second', + 'parameters': { + 'read': self._125k_read_bw_params, + 'write': self._125k_write_bw_params + } + }, 'nccl': { 'max_bus_bw': self.max_bus_bandwidth, 'max_bus_bytes': self.max_bus_bytes, @@ -325,6 +360,44 @@ def average_write_bw(self) -> float: """ return round(self._average_write_bw() * 1e-9, 3) + @average_decorator + def _average_125k_read_bw(self) -> float: + """ + Returns the average 125k read bandwidth as a ``float`` for all + iterations in B/s. Defaults to 0.0. + """ + try: + return self._125k_read_bw[self._num_systems] + except KeyError: + return 0.0 + + @property + def average_125k_read_bw(self) -> float: + """ + Returns the average 125k read bandwidth as a ``float`` for all + iterations in GB/s, rounded to the nearest thousandth. + """ + return round(self._average_125k_read_bw() * 1e-9, 3) + + @average_decorator + def _average_125k_write_bw(self) -> float: + """ + Returns the average 125k write bandwidth as a ``float`` for all + iterations in B/s. Defaults to 0.0 + """ + try: + return self._125k_write_bw[self._num_systems] + except KeyError: + return 0.0 + + @property + def average_125k_write_bw(self) -> float: + """ + Returns the average 125k write bandwidth as a ``float`` for all + iterations in GB/s, rounded to the nearest thousandth. + """ + return round(self._average_125k_write_bw() * 1e-9, 3) + @average_decorator def _average_read_iops(self) -> float: """ diff --git a/bobber/lib/analysis/parse_results.py b/bobber/lib/analysis/parse_results.py index f67a5cc..21bf0a7 100644 --- a/bobber/lib/analysis/parse_results.py +++ b/bobber/lib/analysis/parse_results.py @@ -110,6 +110,42 @@ def parse_fio_iops(log_files: list) -> Tuple[dict, dict, dict, dict]: return read_sys_results, write_sys_results, read_params, write_params +def parse_fio_125k_bw(log_files: list) -> Tuple[dict, dict, dict, dict]: + """ + Parse all FIO 125k bandwidth logs. + + Find each FIO 125k bandwidth log in the results directory and parse the + read and write results and parameters from each log for all system counts. + + Parameters + ---------- + log_files : list + A ``list`` of ``strings`` of the paths to each log file in the results + directory. + + Returns + ------- + tuple + A ``tuple`` of four dictionaries containing the 125k read results, 125k + write results, 125k read parameters, and 125k write parameters for all + system counts. + """ + read_sys_results = defaultdict(list) + write_sys_results = defaultdict(list) + read_params, write_params = None, None + + fio_logs_by_systems = divide_logs_by_systems(log_files, + 'stg_125k_iteration') + + for systems, files in fio_logs_by_systems.items(): + read_sys_results, write_sys_results, read_params, write_params = \ + parse_fio_bw_file(files, + systems, + read_sys_results, + write_sys_results) + return read_sys_results, write_sys_results, read_params, write_params + + def parse_nccl(log_files: list) -> Tuple[dict, dict]: """ Parse all NCCL logs. @@ -251,6 +287,10 @@ def save_yaml_baseline(final_dictionary_output: dict, # FIO IOPS speed in ops/second read: {results.get('iops', {}).get('read', 0)} write: {results.get('iops', {}).get('write', 0)} + 125k_bandwidth: + # FIO 125k BW speed in bytes/second + read: {results.get('125k_bandwidth', {}).get('read', 0)} + write: {results.get('125k_bandwidth', {}).get('write', 0)} nccl: # NCCL maximum bus bandwidth in GB/s max_bus_bw: {results.get('nccl', {}).get('max_bus_bw', 0)} @@ -315,6 +355,9 @@ def main(directory: str, override_version_check) bw_results = parse_fio_bw(log_files) read_bw, write_bw, read_bw_params, write_bw_params = bw_results + bw_125k_results = parse_fio_125k_bw(log_files) + read_125k_bw, write_125k_bw, read_125k_bw_params, write_125k_bw_params = \ + bw_125k_results iops_results = parse_fio_iops(log_files) read_iops, write_iops, read_iops_params, write_iops_params = iops_results metadata = parse_meta(log_files) @@ -323,7 +366,8 @@ def main(directory: str, total_systems = 0 systems = [] - for result in [read_bw, read_iops, max_bw, dali_results, metadata]: + for result in [read_bw, read_iops, read_125k_bw, max_bw, dali_results, + metadata]: try: total_systems = max(result.keys()) systems = sorted(result.keys()) @@ -341,6 +385,10 @@ def main(directory: str, write_iops, read_iops_params, write_iops_params, + read_125k_bw, + write_125k_bw, + read_125k_bw_params, + write_125k_bw_params, max_bw, bytes_sizes, dali_results, diff --git a/bobber/lib/analysis/table.py b/bobber/lib/analysis/table.py index a53ce34..ab379d1 100644 --- a/bobber/lib/analysis/table.py +++ b/bobber/lib/analysis/table.py @@ -10,6 +10,8 @@ FIO_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 1MB BS{bcolors.ENDC}' FIO_READ_IOP = f'{bcolors.BOLD}FIO Read (k IOPS) - 4K BS{bcolors.ENDC}' FIO_WRITE_IOP = f'{bcolors.BOLD}FIO Write (k IOPS) - 4K BS{bcolors.ENDC}' +FIO_125K_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 125K BS{bcolors.ENDC}' +FIO_125K_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 125K BS{bcolors.ENDC}' NCCL = f'{bcolors.BOLD}NCCL Max BW (GB/s){bcolors.ENDC}' DALI_IMG_SM = (f'{bcolors.BOLD}DALI Standard 800x600 throughput ' f'(images/second){bcolors.ENDC}') @@ -146,6 +148,38 @@ def fio_iops(results: list) -> Tuple[list, list]: return [read, write] +def fio_125k_bw(results: list) -> Tuple[list, list]: + """ + Save the FIO 125k bandwidth read and write results. + + Save the read and write results from the FIO 125k bandwidth tests on an + increasing per-system basis with the first element in the list being the + column header. + + Parameters + ---------- + results : list + A ``list`` of ``dictionaries`` containing all results from the tests. + + Returns + ------- + tuple + Returns a ``tuple`` of (``list``, ``list``) containing the read and + write 125k bandwidth results, respectively. + """ + try: + read = [FIO_125K_READ_BW] + [bytes_to_gb(result[1]['125k_bandwidth'] + ['read']) + for result in results] + write = [FIO_125K_WRITE_BW] + [bytes_to_gb(result[1]['125k_bandwidth'] + ['write']) + for result in results] + except KeyError: + return [] + else: + return [read, write] + + def nccl(results: list) -> list: """ Save the NCCL results. @@ -288,6 +322,7 @@ def display_table(json_results: dict) -> NoReturn: data += fio_bw(results) data += fio_iops(results) + data += fio_125k_bw(results) data += nccl(results) data += dali(results) diff --git a/bobber/lib/constants.py b/bobber/lib/constants.py index 7786139..e9fdec4 100644 --- a/bobber/lib/constants.py +++ b/bobber/lib/constants.py @@ -9,11 +9,13 @@ RUN_NCCL = 'run-nccl' RUN_STG_BW = 'run-stg-bw' RUN_STG_IOPS = 'run-stg-iops' +RUN_STG_125K = 'run-stg-125k' RUN_STG_META = 'run-stg-meta' DGX_A100_SINGLE = { 'gpus': 8, 'bw_threads': 16, + 'stg_125k_threads': 16, 'iops_threads': 200, 'batch_size_sm': 512, 'batch_size_lg': 256, @@ -25,6 +27,7 @@ DGX_A100_DUAL = { 'gpus': 8, 'bw_threads': 16, + 'stg_125k_threads': 16, 'iops_threads': 200, 'batch_size_sm': 512, 'batch_size_lg': 256, @@ -36,6 +39,7 @@ DGX_2 = { 'gpus': 16, 'bw-threads': 16, + 'stg_125k_threads': 16, 'batch-size-sm': 150, 'batch-size-lg': 75, 'iops-threads': 80, diff --git a/bobber/lib/tests/run_tests.py b/bobber/lib/tests/run_tests.py index 9824bb4..be32e68 100644 --- a/bobber/lib/tests/run_tests.py +++ b/bobber/lib/tests/run_tests.py @@ -7,6 +7,7 @@ RUN_NCCL, RUN_STG_BW, RUN_STG_IOPS, + RUN_STG_125K, RUN_STG_META ) from bobber.lib.docker import manager @@ -100,6 +101,50 @@ def run_stg_bw(args: Namespace, bobber_version: str, iteration: int, sleep(args.pause) +def run_stg_125k(args: Namespace, bobber_version: str, iteration: int, + hosts: str) -> NoReturn: + """ + Run single or multi-node storage 125KB IO size tests with FIO. + + Run a single or multi-node storage bandwidth test with FIO which first + writes data to the filesystem with 125KB block size and 4GB file size, + followed by reading the data back. + + Parameters + ---------- + args : Namespace + A ``Namespace`` of all settings specified by the user for the test. + bobber_version : string + A ``string`` of the local version of Bobber, such as '5.0.0'. + iteration : int + An ``int`` of the local test number, starting at 1. + hosts : string + A comma-separated list of hostnames to test against, such as + 'host1,host2,host3,host4'. + """ + stg_125k_log = os.path.join(args.log_path, + f'stg_125k_iteration_{iteration}_' + f'threads_{args.stg_125k_threads}_' + f'direct_{args.direct}_' + f'depth_{args.io_depth}_' + f'systems_{len(hosts.split(","))}_' + f'version_{bobber_version}.log') + environment = { + 'EXTRA_FLAGS': args.stg_extra_flags, + 'IO_DEPTH': args.io_depth, + 'IOSIZE': 125, + 'DIRECTIO': args.direct, + 'THREADS': args.stg_125k_threads, + 'HOSTS': hosts + } + manager.execute('tests/fio_multi.sh', + environment=environment, + log_file=stg_125k_log) + + if args.pause > 0: + sleep(args.pause) + + def run_stg_iops(args: Namespace, bobber_version: str, iteration: int, hosts: str) -> NoReturn: """ @@ -253,6 +298,8 @@ def kickoff_test(args: Namespace, bobber_version: str, iteration: int, run_stg_bw(args, bobber_version, iteration, hosts) elif args.command == RUN_STG_IOPS: run_stg_iops(args, bobber_version, iteration, hosts) + elif args.command == RUN_STG_125K: + run_stg_125k(args, bobber_version, iteration, hosts) elif args.command == RUN_STG_META: run_stg_meta(args, bobber_version, iteration, hosts) elif args.command == RUN_ALL: @@ -261,6 +308,7 @@ def kickoff_test(args: Namespace, bobber_version: str, iteration: int, run_stg_bw(args, bobber_version, iteration, hosts) run_dali(args, bobber_version, iteration, hosts) run_stg_iops(args, bobber_version, iteration, hosts) + run_stg_125k(args, bobber_version, iteration, hosts) def test_selector(args: Namespace, bobber_version: str) -> NoReturn: