Skip to content

Commit

Permalink
Merge pull request #73 from NVIDIA/wip-fio-support-125K-io-size
Browse files Browse the repository at this point in the history
Add support for 125KB IO workload
  • Loading branch information
joehandzik authored May 20, 2021
2 parents d21e2d3 + b479378 commit 2b2ceb8
Show file tree
Hide file tree
Showing 7 changed files with 225 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ test:
- python setup.py bdist_wheel sdist
- pip install dist/nvidia_bobber-*-none-any.whl
- bobber cast /raid
- bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --iops-threads 200 test_results localhost
- bobber run-all --ssh-iface enp2s0f0 --iterations 2 --batch-size-sm 512 --batch-size-lg 256 --gpus 4 --bw-threads 16 --125k-threads 32 --iops-threads 96 test_results localhost
- bobber parse-results --compare-baseline single-dgx-station-baseline test_results/
23 changes: 15 additions & 8 deletions bobber/bobber.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
RUN_NCCL,
RUN_STG_BW,
RUN_STG_IOPS,
RUN_STG_125K,
RUN_STG_META,
SYSTEMS
)
Expand Down Expand Up @@ -129,6 +130,9 @@ def parse_args(version: str) -> Namespace:
commands_parent.add_argument('--bw-threads', help='Maximum number of '
'threads to use for bandwidth tests',
type=int)
commands_parent.add_argument('--125k-threads', dest='stg_125k_threads',
help='Maximum number of threads to use for '
'125K IO size tests', type=int)
commands_parent.add_argument('--iops-threads', help='Maximum number of '
'threads to use for iops tests', type=int)
commands_parent.add_argument('--iterations', help='Number of iterations to'
Expand All @@ -143,11 +147,12 @@ def parse_args(version: str) -> Namespace:
'would result in tests for 1, 2, and 3 '
'systems)', action='store_true')
commands_parent.add_argument('--system', help='If system is specified, '
'iops-threads, bw-threads, gpus, batch size, '
'and network interface names are given '
'default values - override by specifying the '
'flags you\'d prefer to override, ignore the '
'flags you are ok with using defaults for '
'iops-threads, 125k-threads, bw-threads, '
'gpus, batch size, and network interface '
'names are given default values - override '
'by specifying the flags you\'d prefer to '
'override, ignore the flags you are ok with '
'using defaults for '
'supported systems: dgx-a100-single, '
'dgx-a100-dual, and dgx-2 for now. -single '
'is used for a system with a single storage '
Expand All @@ -170,11 +175,13 @@ def parse_args(version: str) -> Namespace:
parents=[commands_parent])
commands.add_parser(RUN_NCCL, help='Run NCCL tests only',
parents=[commands_parent])
commands.add_parser(RUN_STG_BW, help='Run storage bandwdith tests only',
commands.add_parser(RUN_STG_BW, help='Run storage bandwidth test only',
parents=[commands_parent])
commands.add_parser(RUN_STG_IOPS, help='Run storage IOPS tests only',
commands.add_parser(RUN_STG_125K, help='Run storage 125 IO size test only',
parents=[commands_parent])
commands.add_parser(RUN_STG_META, help='Run storage metadata tests only',
commands.add_parser(RUN_STG_IOPS, help='Run storage IOPS test only',
parents=[commands_parent])
commands.add_parser(RUN_STG_META, help='Run storage metadata test only',
parents=[commands_parent])

# Options specific to exporting the containers
Expand Down
73 changes: 73 additions & 0 deletions bobber/lib/analysis/aggregate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,18 @@ class AggregateResults:
write_iops_params : dict
A ``dictionary`` of the parameters used during the fio write iops
tests.
read_125k_bw : dict
A ``dictionary`` containing all of the fio 125k read bandwidth results
for N-systems.
write_125k_bw : dict
A ``dictionary`` containing all of the fio 125k write bandwidth results
for N-systems.
read_125k_bw_params : dict
A ``dictionary`` of the parameters used during the fio 125k read
bandwidth tests.
write_125k_bw_params : dict
A ``dictionary`` of the parameters used during the fio 125k write
bandwidth tests.
max_bw : dict
A ``dictionary`` of the maximum bus bandwidth achieved from NCCL tests.
bytes_sizes : dict
Expand All @@ -93,6 +105,10 @@ def __init__(self,
write_iops: dict,
read_iops_params: dict,
write_iops_params: dict,
read_125k_bw: dict,
write_125k_bw: dict,
read_125k_bw_params: dict,
write_125k_bw_params: dict,
max_bw: dict,
bytes_sizes: dict,
dali_results: dict,
Expand All @@ -102,10 +118,14 @@ def __init__(self,
self._read_bw_params = read_bw_params
self._read_iops = read_iops
self._read_iops_params = read_iops_params
self._125k_read_bw = read_125k_bw
self._125k_read_bw_params = read_125k_bw_params
self._write_bw = write_bw
self._write_bw_params = write_bw_params
self._write_iops = write_iops
self._write_iops_params = write_iops_params
self._125k_write_bw = write_125k_bw
self._125k_write_bw_params = write_125k_bw_params
self._max_bw = max_bw
self._bytes_sizes = bytes_sizes
self._dali_results = dali_results
Expand All @@ -124,6 +144,8 @@ def __str__(self) -> str:
Aggregate Write Bandwidth: 1.232 GB/s
Aggregate Read IOPS: 136.5 k IOPS
Aggregate Write IOPS: 135.0 k IOPS
Aggregate 125k Read Bandwidth: 1.595 GB/s
Aggregate 125k Write Bandwidth: 1.232 GB/s
NCCL Max Bus Bandwidth: 79.865 at 512.0 MB
Mdtest
Directory creation: 71406.29550000001 ops
Expand Down Expand Up @@ -159,6 +181,10 @@ def __str__(self) -> str:
['Systems tested:', self._num_systems, ''],
['Aggregate Read Bandwidth:', self.average_read_bw, ' GB/s'],
['Aggregate Write Bandwidth:', self.average_write_bw, ' GB/s'],
['Aggregate 125k Read Bandwidth:', self.average_125k_read_bw,
' GB/s'],
['Aggregate 125k Write Bandwidth:', self.average_125k_write_bw,
' GB/s'],
['Aggregate Read IOPS:', self.average_read_iops, 'k IOPS'],
['Aggregate Write IOPS:', self.average_write_iops, 'k IOPS'],
]
Expand Down Expand Up @@ -275,6 +301,15 @@ def json(self) -> dict:
'write': self._write_iops_params
}
},
'125k_bandwidth': {
'read': self._average_125k_read_bw(),
'write': self._average_125k_write_bw(),
'unit': 'operations/second',
'parameters': {
'read': self._125k_read_bw_params,
'write': self._125k_write_bw_params
}
},
'nccl': {
'max_bus_bw': self.max_bus_bandwidth,
'max_bus_bytes': self.max_bus_bytes,
Expand Down Expand Up @@ -325,6 +360,44 @@ def average_write_bw(self) -> float:
"""
return round(self._average_write_bw() * 1e-9, 3)

@average_decorator
def _average_125k_read_bw(self) -> float:
    """
    Fetch the 125k read bandwidth results for the current system count.

    The entry stored under ``self._num_systems`` in the 125k read results
    is handed back unchanged. ``average_decorator`` wraps this method and
    presumably reduces that entry to a single average in B/s - the
    decorator is defined outside this view, so confirm there.

    Returns
    -------
    float
        ``0.0`` when the lookup for the current system count raises a
        ``KeyError``.
    """
    try:
        per_system = self._125k_read_bw[self._num_systems]
    except KeyError:
        # No 125k read results were captured for this system count.
        return 0.0
    return per_system

@property
def average_125k_read_bw(self) -> float:
    """
    The 125k read bandwidth scaled by ``1e-9`` (B/s to GB/s).

    Returns
    -------
    float
        The result of ``_average_125k_read_bw()`` multiplied by ``1e-9``
        and rounded to three decimal places.
    """
    bytes_per_second = self._average_125k_read_bw()
    return round(bytes_per_second * 1e-9, 3)

@average_decorator
def _average_125k_write_bw(self) -> float:
    """
    Fetch the 125k write bandwidth results for the current system count.

    The entry stored under ``self._num_systems`` in the 125k write results
    is handed back unchanged. ``average_decorator`` wraps this method and
    presumably reduces that entry to a single average in B/s - the
    decorator is defined outside this view, so confirm there.

    Returns
    -------
    float
        ``0.0`` when the lookup for the current system count raises a
        ``KeyError``.
    """
    try:
        per_system = self._125k_write_bw[self._num_systems]
    except KeyError:
        # No 125k write results were captured for this system count.
        return 0.0
    return per_system

@property
def average_125k_write_bw(self) -> float:
    """
    The 125k write bandwidth scaled by ``1e-9`` (B/s to GB/s).

    Returns
    -------
    float
        The result of ``_average_125k_write_bw()`` multiplied by ``1e-9``
        and rounded to three decimal places.
    """
    bytes_per_second = self._average_125k_write_bw()
    return round(bytes_per_second * 1e-9, 3)

@average_decorator
def _average_read_iops(self) -> float:
"""
Expand Down
50 changes: 49 additions & 1 deletion bobber/lib/analysis/parse_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,42 @@ def parse_fio_iops(log_files: list) -> Tuple[dict, dict, dict, dict]:
return read_sys_results, write_sys_results, read_params, write_params


def parse_fio_125k_bw(log_files: list) -> Tuple[dict, dict, dict, dict]:
    """
    Parse every FIO 125k bandwidth log.

    The log paths are grouped by system count using the
    ``stg_125k_iteration`` marker, and each group is passed to
    ``parse_fio_bw_file`` together with the running read and write result
    containers.

    Parameters
    ----------
    log_files : list
        A ``list`` of ``strings`` of the paths to each log file in the
        results directory.

    Returns
    -------
    tuple
        A ``tuple`` of four items: the 125k read results, the 125k write
        results, the 125k read parameters, and the 125k write parameters.
        Both parameter items are ``None`` when no 125k logs are found, and
        otherwise hold whatever the last parsed group returned.
    """
    reads = defaultdict(list)
    writes = defaultdict(list)
    read_cfg = write_cfg = None

    grouped_logs = divide_logs_by_systems(log_files, 'stg_125k_iteration')

    for system_count, paths in grouped_logs.items():
        # The result containers are threaded through each call so that the
        # values accumulate across system counts.
        parsed = parse_fio_bw_file(paths, system_count, reads, writes)
        reads, writes, read_cfg, write_cfg = parsed
    return reads, writes, read_cfg, write_cfg


def parse_nccl(log_files: list) -> Tuple[dict, dict]:
"""
Parse all NCCL logs.
Expand Down Expand Up @@ -251,6 +287,10 @@ def save_yaml_baseline(final_dictionary_output: dict,
# FIO IOPS speed in ops/second
read: {results.get('iops', {}).get('read', 0)}
write: {results.get('iops', {}).get('write', 0)}
125k_bandwidth:
# FIO 125k BW speed in bytes/second
read: {results.get('125k_bandwidth', {}).get('read', 0)}
write: {results.get('125k_bandwidth', {}).get('write', 0)}
nccl:
# NCCL maximum bus bandwidth in GB/s
max_bus_bw: {results.get('nccl', {}).get('max_bus_bw', 0)}
Expand Down Expand Up @@ -315,6 +355,9 @@ def main(directory: str,
override_version_check)
bw_results = parse_fio_bw(log_files)
read_bw, write_bw, read_bw_params, write_bw_params = bw_results
bw_125k_results = parse_fio_125k_bw(log_files)
read_125k_bw, write_125k_bw, read_125k_bw_params, write_125k_bw_params = \
bw_125k_results
iops_results = parse_fio_iops(log_files)
read_iops, write_iops, read_iops_params, write_iops_params = iops_results
metadata = parse_meta(log_files)
Expand All @@ -323,7 +366,8 @@ def main(directory: str,
total_systems = 0
systems = []

for result in [read_bw, read_iops, max_bw, dali_results, metadata]:
for result in [read_bw, read_iops, read_125k_bw, max_bw, dali_results,
metadata]:
try:
total_systems = max(result.keys())
systems = sorted(result.keys())
Expand All @@ -341,6 +385,10 @@ def main(directory: str,
write_iops,
read_iops_params,
write_iops_params,
read_125k_bw,
write_125k_bw,
read_125k_bw_params,
write_125k_bw_params,
max_bw,
bytes_sizes,
dali_results,
Expand Down
35 changes: 35 additions & 0 deletions bobber/lib/analysis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
FIO_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 1MB BS{bcolors.ENDC}'
FIO_READ_IOP = f'{bcolors.BOLD}FIO Read (k IOPS) - 4K BS{bcolors.ENDC}'
FIO_WRITE_IOP = f'{bcolors.BOLD}FIO Write (k IOPS) - 4K BS{bcolors.ENDC}'
FIO_125K_READ_BW = f'{bcolors.BOLD}FIO Read (GB/s) - 125K BS{bcolors.ENDC}'
FIO_125K_WRITE_BW = f'{bcolors.BOLD}FIO Write (GB/s) - 125K BS{bcolors.ENDC}'
NCCL = f'{bcolors.BOLD}NCCL Max BW (GB/s){bcolors.ENDC}'
DALI_IMG_SM = (f'{bcolors.BOLD}DALI Standard 800x600 throughput '
f'(images/second){bcolors.ENDC}')
Expand Down Expand Up @@ -146,6 +148,38 @@ def fio_iops(results: list) -> Tuple[list, list]:
return [read, write]


def fio_125k_bw(results: list) -> Tuple[list, list]:
    """
    Build the table rows for the FIO 125k bandwidth results.

    Each row starts with its column header, followed by one value per entry
    in ``results``, converted with ``bytes_to_gb``.

    Parameters
    ----------
    results : list
        A ``list`` of test results. Each element is indexed at position
        ``1`` to reach a mapping that holds the ``'125k_bandwidth'``
        section with ``'read'`` and ``'write'`` values.

    Returns
    -------
    list
        A two-element ``list`` holding the read row and the write row, in
        that order, or an empty ``list`` if any lookup raises a
        ``KeyError``.
    """
    read_row = [FIO_125K_READ_BW]
    write_row = [FIO_125K_WRITE_BW]
    try:
        # All read values are collected before any write value is touched.
        for entry in results:
            read_row.append(bytes_to_gb(entry[1]['125k_bandwidth']['read']))
        for entry in results:
            write_row.append(bytes_to_gb(entry[1]['125k_bandwidth']['write']))
    except KeyError:
        # Results without a 125k section contribute no rows to the table.
        return []
    return [read_row, write_row]


def nccl(results: list) -> list:
"""
Save the NCCL results.
Expand Down Expand Up @@ -288,6 +322,7 @@ def display_table(json_results: dict) -> NoReturn:

data += fio_bw(results)
data += fio_iops(results)
data += fio_125k_bw(results)
data += nccl(results)
data += dali(results)

Expand Down
4 changes: 4 additions & 0 deletions bobber/lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
RUN_NCCL = 'run-nccl'
RUN_STG_BW = 'run-stg-bw'
RUN_STG_IOPS = 'run-stg-iops'
RUN_STG_125K = 'run-stg-125k'
RUN_STG_META = 'run-stg-meta'

DGX_A100_SINGLE = {
'gpus': 8,
'bw_threads': 16,
'stg_125k_threads': 16,
'iops_threads': 200,
'batch_size_sm': 512,
'batch_size_lg': 256,
Expand All @@ -25,6 +27,7 @@
DGX_A100_DUAL = {
'gpus': 8,
'bw_threads': 16,
'stg_125k_threads': 16,
'iops_threads': 200,
'batch_size_sm': 512,
'batch_size_lg': 256,
Expand All @@ -36,6 +39,7 @@
DGX_2 = {
'gpus': 16,
'bw-threads': 16,
'stg_125k_threads': 16,
'batch-size-sm': 150,
'batch-size-lg': 75,
'iops-threads': 80,
Expand Down
Loading

0 comments on commit 2b2ceb8

Please sign in to comment.