From 3449cf75902bd584ec903ccf58dac49040df04eb Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Mar 2021 16:36:44 +0100 Subject: [PATCH 1/7] ocrd process: simple memory usage measurement --- ocrd/ocrd/processor/helpers.py | 10 +++++++++- ocrd/requirements.txt | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 51a02da496..9cb8817d7f 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -5,6 +5,8 @@ import json import inspect from subprocess import run, PIPE +from memory_profiler import memory_usage +from sparklines import sparklines from click import wrap_text from ocrd_utils import getLogger @@ -69,7 +71,13 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() - processor.process() + mem_usage = memory_usage(proc=processor.process, interval=.1, timeout=None, timestamps=True) + mem_usage_values = [mem for mem, _ in mem_usage] + mem_output = 'memory consumption: ' + mem_output += ''.join(sparklines(mem_usage_values)) + mem_output += '\n' + mem_output += 'max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values)) + logProfile.info(mem_output) t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s']" % ( diff --git a/ocrd/requirements.txt b/ocrd/requirements.txt index 2da0163b74..ad30bc1f83 100644 --- a/ocrd/requirements.txt +++ b/ocrd/requirements.txt @@ -8,3 +8,5 @@ Flask jsonschema pyyaml Deprecated == 1.2.0 +memory-profiler >= 0.58.0 +sparklines >= 0.4.2 From 0cfa612d45a16982f6b9e9c3a4ae82a49ee7b6ab Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Mar 2021 17:25:23 +0100 Subject: [PATCH 2/7] fix profiler logging test --- tests/test_logging.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_logging.py b/tests/test_logging.py index 95c8a8f27d..60b5714982 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -167,7 +167,9 @@ def testProcessorProfiling(self): # with open('/tmp/debug.log', 'w') as f: # f.write(log_contents) # Check whether profile information has been logged. Dummy should finish in under 0.1s - self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', log_contents)) + # print(log_contents) + assert "Executing processor 'ocrd-test' took 0" in log_contents + # self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', log_contents)) def test_tmpConfigfile(self): self.assertNotEqual(logging.getLogger('').getEffectiveLevel(), logging.NOTSET) From 7e78a5d1f5670bfedf89f9c9d4dfcc8d88badffc Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 4 Mar 2021 18:12:14 +0100 Subject: [PATCH 3/7] ocrd process: memory_usage w/ multiprocess and include_children --- ocrd/ocrd/processor/helpers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 9cb8817d7f..230d615829 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -71,12 +71,11 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() - mem_usage = memory_usage(proc=processor.process, interval=.1, timeout=None, timestamps=True) + mem_usage = memory_usage(proc=processor.process, interval=.1, timeout=None, timestamps=True, multiprocess=True, include_children=True) mem_usage_values = [mem for mem, _ in mem_usage] mem_output = 'memory consumption: ' mem_output += ''.join(sparklines(mem_usage_values)) - mem_output += '\n' - mem_output += 'max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values)) + mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values)) logProfile.info(mem_output) t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu From 342316e68d39f58d07a158f4794ec06ae5dec959 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Nov 2022 11:53:06 +0100 Subject: [PATCH 4/7] Update ocrd/ocrd/processor/helpers.py Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- ocrd/ocrd/processor/helpers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 821468c802..3a112bfead 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -87,7 +87,12 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() - mem_usage = memory_usage(proc=processor.process, interval=.1, timeout=None, timestamps=True, multiprocess=True, include_children=True) + mem_usage = memory_usage(proc=processor.process, + interval=.1, timeout=None, timestamps=True, + # include sub-processes + multiprocess=True, include_children=True, + # get proportional set size instead of RSS + backend='psutil_pss') mem_usage_values = [mem for mem, _ in mem_usage] mem_output = 'memory consumption: ' mem_output += ''.join(sparklines(mem_usage_values)) From a103ac9f2d08d468eb2e95f9ce19a3eef3104161 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Nov 2022 12:00:44 +0100 Subject: [PATCH 5/7] mem_usage: run process only once --- ocrd/ocrd/processor/helpers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 3a112bfead..3232ea6c2e 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -87,7 +87,9 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() - mem_usage = memory_usage(proc=processor.process, + mem_usage = memory_usage(proc=processor.process, + # only run process once + max_iterations=1, interval=.1, timeout=None, timestamps=True, # include sub-processes multiprocess=True, include_children=True, From 4fdfbf39b3c7de7a858aa231d2098de998fed94e Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Nov 2022 12:46:19 +0100 Subject: [PATCH 6/7] Implement OCRD_PROFILE{_FILE} environemnt variable to configure CPU/memory profiling --- ocrd/ocrd/decorators/__init__.py | 7 +++++++ ocrd/ocrd/processor/helpers.py | 31 ++++++++++++++++++------------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/ocrd/ocrd/decorators/__init__.py b/ocrd/ocrd/decorators/__init__.py index d7dee6e6ec..ff908c3e7f 100644 --- a/ocrd/ocrd/decorators/__init__.py +++ b/ocrd/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ from os.path import isfile +from os import environ import sys import click @@ -89,6 +90,12 @@ def ocrd_cli_wrap_processor( report = WorkspaceValidator.check_file_grp(workspace, kwargs['input_file_grp'], '' if overwrite else kwargs['output_file_grp'], page_id) if not report.is_valid: raise Exception("Invalid input/output file grps:\n\t%s" % '\n\t'.join(report.errors)) + # Set up profiling behavior from environment variables/flags + if not profile and 'OCRD_PROFILE' in environ: + if 'CPU' in environ['OCRD_PROFILE']: + profile = True + if not profile_file and 'OCRD_PROFILE_FILE' in environ: + profile_file = environ['OCRD_PROFILE_FILE'] if profile or profile_file: import cProfile import pstats diff --git a/ocrd/ocrd/processor/helpers.py b/ocrd/ocrd/processor/helpers.py index 3232ea6c2e..e07333eaa2 100644 --- a/ocrd/ocrd/processor/helpers.py +++ b/ocrd/ocrd/processor/helpers.py @@ -1,6 +1,7 @@ """ Helper methods for running and documenting processors """ +from os import environ from time import perf_counter, process_time import json import inspect @@ -87,19 +88,23 @@ def run_processor( log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() t0_cpu = process_time() - mem_usage = memory_usage(proc=processor.process, - # only run process once - max_iterations=1, - interval=.1, timeout=None, timestamps=True, - # include sub-processes - multiprocess=True, include_children=True, - # get proportional set size instead of RSS - backend='psutil_pss') - mem_usage_values = [mem for mem, _ in mem_usage] - mem_output = 'memory consumption: ' - mem_output += ''.join(sparklines(mem_usage_values)) - mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values)) - logProfile.info(mem_output) + if any(x in environ.get('OCRD_PROFILE', '') for x in ['RSS', 'PSS']): + backend = 'psutil_pss' if 'PSS' in environ['OCRD_PROFILE'] else 'psutil' + mem_usage = memory_usage(proc=processor.process, + # only run process once + max_iterations=1, + interval=.1, timeout=None, timestamps=True, + # include sub-processes + multiprocess=True, include_children=True, + # get proportional set size instead of RSS + backend=backend) + mem_usage_values = [mem for mem, _ in mem_usage] + mem_output = 'memory consumption: ' + mem_output += ''.join(sparklines(mem_usage_values)) + mem_output += ' max: %.2f MiB min: %.2f MiB' % (max(mem_usage_values), min(mem_usage_values)) + logProfile.info(mem_output) + else: + processor.process() t1_wall = perf_counter() - t0_wall t1_cpu = process_time() - t0_cpu logProfile.info("Executing processor '%s' took %fs (wall) %fs (CPU)( [--input-file-grp='%s' --output-file-grp='%s' --parameter='%s' --page-id='%s']" % ( From 97a14b0ac79085e9c41af0da45863e65acf9ae9b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 23 Nov 2022 12:53:48 +0100 Subject: [PATCH 7/7] describe OCRD_PROFILE{,_FILE} in README --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 56ef4ce34f..e9fa84fe17 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ * [Command line tools](#command-line-tools) * [`ocrd` CLI](#ocrd-cli) * [`ocrd-dummy` CLI](#ocrd-dummy-cli) +* [Configuration](#configuration) * [Packages](#packages) * [ocrd_utils](#ocrd_utils) * [ocrd_models](#ocrd_models) @@ -81,6 +82,19 @@ supported flags, options and arguments. A minimal [OCR-D processor](https://ocr-d.de/en/user_guide#using-the-ocr-d-processors) that copies from `-I/-input-file-grp` to `-O/-output-file-grp` +## Configuration + +Almost all behaviour of the OCR-D/core software is configured via CLI options and flags, which can be listed with the `--help` flag that all CLI support. + +Some parts of the software are configured via environement variables: + +* `OCRD_METS_CACHING`: If set to `true`, access to the METS file is cached, speeding in-memory search and modification. +* `OCRD_PROFILE`: This variable configures the built-in CPU and memory profiling. If empty, no profiling is done. Otherwise expected to contain any of the following tokens: + * `CPU`: Enable CPU profiling of processor runs + * `RSS`: Enable RSS memory profiling + * `PSS`: Enable proportionate memory profiling +* `OCRD_PROFILE_FILE`: If set, then the CPU profile is written to this file for later peruse with a analysis tools like [snakeviz](https://jiffyclub.github.io/snakeviz/) + ## Packages ### ocrd_utils