From a4491065cb53c919b3abc04d18670e4ddd8b02b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jes=C3=BAs=20Garc=C3=ADa=20Crespo?= Date: Thu, 12 Sep 2024 09:57:55 +0000 Subject: [PATCH] Use clamav-client instead of clamd --- internal/workflow/assets/workflow.json | 10 +- worker/pyproject.toml | 4 +- worker/settings/common.py | 2 +- worker/tests/test_antivirus.py | 356 +++++++------- worker/tests/test_antivirus_clamdscan.py | 159 ------- worker/tests/test_antivirus_clamscan.py | 79 --- worker/uv.lock | 12 +- .../clientScripts/archivematica_clamscan.py | 449 +++++++----------- worker/worker/utils/custom_handlers.py | 2 +- worker/worker/utils/databaseFunctions.py | 2 +- 10 files changed, 351 insertions(+), 724 deletions(-) delete mode 100644 worker/tests/test_antivirus_clamdscan.py delete mode 100644 worker/tests/test_antivirus_clamscan.py diff --git a/internal/workflow/assets/workflow.json b/internal/workflow/assets/workflow.json index 0e8f0fe9..3cfa2503 100644 --- a/internal/workflow/assets/workflow.json +++ b/internal/workflow/assets/workflow.json @@ -2426,7 +2426,7 @@ "config": { "@manager": "linkTaskManagerFiles", "@model": "StandardTaskConfig", - "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\" \"%taskUUID%\"", + "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\"", "execute": "archivematicaClamscan_v0.0", "filter_subdir": "objects/submissionDocumentation", "stderr_file": "%SIPLogsDirectory%clamAVScan.txt" @@ -2457,7 +2457,7 @@ "config": { "@manager": "linkTaskManagerFiles", "@model": "StandardTaskConfig", - "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\" \"%taskUUID%\"", + "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\"", "execute": "archivematicaClamscan_v0.0", "stderr_file": "%SIPLogsDirectory%clamAVScan.txt" }, @@ -2757,7 +2757,7 @@ "config": { "@manager": "linkTaskManagerFiles", "@model": "StandardTaskConfig", - "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\" \"%taskUUID%\"", + "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\"", "execute": "archivematicaClamscan_v0.0", "stderr_file": "%SIPLogsDirectory%clamAVScan.txt" }, @@ -6335,7 +6335,7 @@ "config": { "@manager": "linkTaskManagerFiles", "@model": "StandardTaskConfig", - "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\" \"%taskUUID%\"", + "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\"", "execute": "archivematicaClamscan_v0.0", "filter_subdir": "objects/" }, @@ -6882,7 +6882,7 @@ "config": { "@manager": "linkTaskManagerFiles", "@model": "StandardTaskConfig", - "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\" \"%taskUUID%\"", + "arguments": "\"%fileUUID%\" \"%relativeLocation%\" \"%date%\"", "execute": "archivematicaClamscan_v0.0", "filter_subdir": "objects/metadata", "stderr_file": "%SIPLogsDirectory%clamAVScan.txt" diff --git a/worker/pyproject.toml b/worker/pyproject.toml index 0d3d74b9..4db8cdb2 100644 --- a/worker/pyproject.toml +++ b/worker/pyproject.toml @@ -7,7 +7,7 @@ dependencies = [ "agentarchives>=0.9.0", "ammcpc>=0.2.0", "bagit", - "clamd>=1.0.2", + "clamav-client>=0.5.1", "django-autoslug>=1.9.9", "django-tastypie>=0.14.7", "gearman3", @@ -89,6 +89,7 @@ warn_unused_configs = true [[tool.mypy.overrides]] module = [ "worker.client.*", + "worker.clientScripts.archivematica_clamscan", "worker.clientScripts.characterize_file", "worker.clientScripts.has_packages", "worker.clientScripts.identify_file_format", @@ -98,6 +99,7 @@ module = [ "worker.clientScripts.validate_file", "worker.utils.executeOrRunSubProcess", "worker.tests.conftest", + "worker.tests.test_antivirus", "worker.tests.test_characterize_file", "worker.tests.test_has_packages", "worker.tests.test_identify_file_format", diff --git a/worker/settings/common.py b/worker/settings/common.py index 38c0ec22..74e9c4b2 100644 --- a/worker/settings/common.py +++ b/worker/settings/common.py @@ -172,7 +172,7 @@ def workers(config, section): server = /var/run/clamav/clamd.ctl pass_by_stream = True client_timeout = 86400 -client_backend = clamdscanner ; Options: clamdscanner or clamscanner +client_backend = clamscanner ; Options: clamdscanner or clamscanner client_max_file_size = 42 ; MB client_max_scan_size = 42 ; MB diff --git a/worker/tests/test_antivirus.py b/worker/tests/test_antivirus.py index 6d0e883a..df914492 100644 --- a/worker/tests/test_antivirus.py +++ b/worker/tests/test_antivirus.py @@ -1,199 +1,173 @@ -"""Tests for the archivematica_clamscan.py client script.""" +""" -from collections import OrderedDict -from collections import namedtuple +Reminders: + +- [*] Validate configuration. +- [ ] Use mypy in VSCode +- [ ] Douglas thinks that we should review what we send to pyprint and logger. + +Testing: + + $ pytest --disable-warnings -pno:randomly -vv --cov --cov-report html:/tmp/coverage.html -x -- tests/test_antivirus.py + $ open file://///tmp/coverage.html/index.html + $ mypy tests/test_antivirus.py worker/clientScripts/archivematica_clamscan.py + +Misc: + +- [ ] mypy pre-commit hook isn't working, move to dagger? + just -> dagger (for everything) +- [ ] dagger to test with mysql (sqlite isn't really supported) + +""" + +from multiprocessing import cpu_count +from unittest import mock import pytest +import pytest_django + +from worker.client.job import Job +from worker.clientScripts.archivematica_clamscan import call +from worker.clientScripts.archivematica_clamscan import concurrent_instances +from worker.main.models import Event +from worker.main.models import File + +TASK_DATE = "2024-09-13T09:17:35.702089+00:00" # isoformat + + +def test_concurrent_instances() -> None: + assert concurrent_instances() == cpu_count() + + +def test_invalid_config_backend( + settings: pytest_django.fixtures.SettingsWrapper, +) -> None: + settings.CLAMAV_CLIENT_BACKEND = "invalid-backend" + with pytest.raises(ValueError, match="Unsupported backend type: invalid-backend"): + call([]) + + +@pytest.mark.django_db +def test_valid_backends( + settings: pytest_django.fixtures.SettingsWrapper, +) -> None: + for backend in ( + "clamscanner", # » ``clamav_client.scanner.ClamscanScanner``. + "clamscan", # » ``clamav_client.scanner.ClamscanScanner``. + "clamdscanner", # » ``clamav_client.scanner.ClamdScanner``. + "clamd", # » ``clamav_client.scanner.ClamdScanner``. + ): + settings.CLAMAV_CLIENT_BACKEND = backend + call([]) + + +@pytest.mark.django_db +def test_invalid_file_id() -> None: + job = mock.Mock( + args=[ + "archivematica_clamscan.py", + "None", + "path", + TASK_DATE, + ], + JobContext=mock.MagicMock(), + spec=Job, + ) + + call([job]) + + job.set_status.assert_called_once_with(1) + + assert Event.objects.filter(event_type="virus check").count() == 0 + + +@pytest.mark.django_db +def test_file_already_scanned(transfer_file: File) -> None: + Event.objects.create(file_uuid=transfer_file, event_type="virus check", event_outcome="Pass") + + job = mock.Mock( + args=[ + "archivematica_clamscan.py", + str(transfer_file.pk), + transfer_file.currentlocation.decode(), + TASK_DATE, + ], + JobContext=mock.MagicMock(), + spec=Job, + ) + + call([job]) + + job.set_status.assert_called_once_with(0) + assert ( + Event.objects.filter(file_uuid=transfer_file, event_type="virus check").count() + == 1 + ) + -from tests import test_antivirus_clamdscan -from worker.clientScripts import archivematica_clamscan - - -def test_get_scanner(settings): - """Test that get_scanner returns the correct instance of antivirus - per the user's configuration. Test return of clamdscanner by default.""" - - # Ensure that environment settings are available to the mock classes. - test_antivirus_clamdscan.setup_clamdscanner(settings) - - # Testing to ensure clamscanner is returned when explicitly set. - settings.CLAMAV_CLIENT_BACKEND = "clamscanner" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamScanner) - - # Testing to ensure that clamdscanner is returned when explicitly set. - settings.CLAMAV_CLIENT_BACKEND = "clamdscanner" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is the default returned scanner. - settings.CLAMAV_CLIENT_BACKEND = "fprot" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is the default returned scanner when - # the user configures an empty string. - settings.CLAMAV_CLIENT_BACKEND = "" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is returned when the environment - # hasn't been configured appropriately and None is returned. - settings.CLAMAV_CLIENT_BACKEND = None - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is returned when another variable - # type is specified, e.g. in this instance, an integer. - settings.CLAMAV_CLIENT_BACKEND = 10 - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - -args = OrderedDict() -args["file_uuid"] = "ec26199f-72a4-4fd8-a94a-29144b02ddd8" -args["path"] = "/path" -args["date"] = "2019-12-01" -args["task_uuid"] = "c380e94e-7a7b-4ab8-aa72-ec0644cc3f5d" - - -class FileMock: - def __init__(self, size): - self.size = size - - -class ScannerMock(archivematica_clamscan.ScannerBase): - PROGRAM = "Mock" - - def __init__(self, should_except=False, passed=False): - self.should_except = should_except - self.passed = passed - - def scan(self, path): - if self.should_except: - raise Exception("Something really bad happened!") - return self.passed, None, None - - def version_attrs(self): - return ("version", "virus_definitions") - - -def setup_test_scan_file_mocks( - mocker, - file_already_scanned=False, - file_size=1024, - scanner_should_except=False, - scanner_passed=False, -): - deps = namedtuple("deps", ["file_already_scanned", "file_get", "scanner"])( - file_already_scanned=mocker.patch( - "worker.clientScripts.archivematica_clamscan.file_already_scanned", - return_value=file_already_scanned, - ), - file_get=mocker.patch( - "worker.main.models.File.objects.get", return_value=FileMock(size=file_size) - ), - scanner=ScannerMock(should_except=scanner_should_except, passed=scanner_passed), +@pytest.mark.django_db +def test_unsized_file(transfer_file: File) -> None: + job = mock.Mock( + args=[ + "archivematica_clamscan.py", + str(transfer_file.pk), + transfer_file.currentlocation.decode(), + TASK_DATE, + ], + JobContext=mock.MagicMock(), + spec=Job, ) - mocker.patch( - "worker.clientScripts.archivematica_clamscan.get_scanner", - return_value=deps.scanner, + call([job]) + + job.set_status.assert_called_once_with(1) + assert ( + Event.objects.filter( + file_uuid=transfer_file, event_type="virus check", event_outcome="Fail" + ).count() + == 1 ) - return deps - - -def test_scan_file_already_scanned(mocker): - deps = setup_test_scan_file_mocks(mocker, file_already_scanned=True) - - exit_code = archivematica_clamscan.scan_file([], **dict(args)) - - assert exit_code == 0 - deps.file_already_scanned.assert_called_once_with(args["file_uuid"]) - - -QueueEventParams = namedtuple("QueueEventParams", ["scanner_is_None", "passed"]) - - -@pytest.mark.parametrize( - "setup_kwargs, exit_code, queue_event_params", - [ - # File size too big for given file_size param - ( - {"file_size": 43, "scanner_passed": None}, - 0, - QueueEventParams(scanner_is_None=None, passed=None), - ), - # File size too big for given file_scan param - ( - {"file_size": 85, "scanner_passed": None}, - 0, - QueueEventParams(scanner_is_None=None, passed=None), - ), - # File size within given file_size param, and file_scan param - ( - {"file_size": 42, "scanner_passed": True}, - 0, - QueueEventParams(scanner_is_None=False, passed=True), - ), - # Scan returns None with no-error, e.g. Broken Pipe - ( - {"scanner_passed": None}, - 0, - QueueEventParams(scanner_is_None=None, passed=None), - ), - # Zero byte file passes - ( - {"file_size": 0, "scanner_passed": True}, - 0, - QueueEventParams(scanner_is_None=False, passed=True), - ), - # Virus found - ( - {"scanner_passed": False}, - 1, - QueueEventParams(scanner_is_None=False, passed=False), - ), - # Passed - ( - {"scanner_passed": True}, - 0, - QueueEventParams(scanner_is_None=False, passed=True), - ), - ], -) -def test_scan_file(mocker, setup_kwargs, exit_code, queue_event_params, settings): - setup_test_scan_file_mocks(mocker, **setup_kwargs) - - # Here the user configurable thresholds for maimum file size, and maximum - # scan size are being tested. The scan size is offset so as to enable the - # test to fall through correctly and eventually return None for - # not-scanned. - settings.CLAMAV_CLIENT_MAX_FILE_SIZE = 42 - settings.CLAMAV_CLIENT_MAX_SCAN_SIZE = 84 - - event_queue = [] - - ret = archivematica_clamscan.scan_file(event_queue, **dict(args)) - - # The integer returned by scan_file() is going to be used as the exit code - # of the archivematica_clamscan.py script which is important for the AM - # workflow in order to control what to do next. - assert exit_code == ret - - # A side effect of scan_file() is to queue an event to be created in the - # database. - if queue_event_params.passed is None: - assert len(event_queue) == 0 - else: - assert len(event_queue) == 1 - - event = event_queue[0] - assert event["eventType"] == "virus check" - assert event["fileUUID"] == args["file_uuid"] - assert ( - event["eventOutcome"] == "Pass" - if setup_kwargs["scanner_passed"] - else "Fail" - ) + +@pytest.mark.django_db +def test_unexistent_file() -> None: + job = mock.Mock( + args=[ + "archivematica_clamscan.py", + "6cc38bf0-d8e2-414f-a8f8-c946ae2b5255", + "/path", + TASK_DATE, + ], + JobContext=mock.MagicMock(), + spec=Job, + ) + + call([job]) + + job.set_status.assert_called_once_with(1) + assert Event.objects.filter(event_type="virus check").count() == 0 + + +@pytest.mark.django_db +def test_file_size_exceeding_max_settings() -> None: + pass # TODO + + +@pytest.mark.django_db +def test_scan_exception() -> None: + pass # TODO + + +@pytest.mark.django_db +def test_scan_passed() -> None: + pass # TODO + + +@pytest.mark.django_db +def test_scan_found() -> None: + pass # TODO + + +@pytest.mark.django_db +def test_scan_many() -> None: + pass # TODO diff --git a/worker/tests/test_antivirus_clamdscan.py b/worker/tests/test_antivirus_clamdscan.py deleted file mode 100644 index cb978f8c..00000000 --- a/worker/tests/test_antivirus_clamdscan.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Tests for the archivematica_clamscan.py client script.""" - -import errno -from collections import namedtuple - -from clamd import BufferTooLongError -from clamd import ClamdNetworkSocket -from clamd import ClamdUnixSocket -from clamd import ConnectionError - -from worker.clientScripts import archivematica_clamscan - - -def setup_clamdscanner( - settings, addr="/var/run/clamav/clamd.ctl", timeout=10, stream=False -): - settings.CLAMAV_SERVER = addr - settings.CLAMAV_CLIENT_TIMEOUT = timeout - settings.CLAMAV_PASS_BY_STREAM = stream - - return archivematica_clamscan.ClamdScanner() - - -def test_clamdscanner_version_props(mocker, settings): - scanner = setup_clamdscanner(settings) - mocker.patch.object( - scanner, - "version_attrs", - return_value=("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017"), - ) - - assert scanner.program() == "ClamAV (clamd)" - assert scanner.version() == "ClamAV 0.99.2" - assert scanner.virus_definitions() == "23992/Fri Oct 27 05:04:12 2017" - - -def test_clamdscanner_version_attrs(mocker, settings): - scanner = setup_clamdscanner(settings, addr="/var/run/clamav/clamd.ctl") - version = mocker.patch.object( - scanner.client, - "version", - return_value="ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017", - ) - - assert scanner.version_attrs() == ( - "ClamAV 0.99.2", - "23992/Fri Oct 27 05:04:12 2017", - ) - version.assert_called_once() - - -def test_clamdscanner_get_client(settings): - scanner = setup_clamdscanner(settings, addr="/var/run/clamav/clamd.ctl") - assert isinstance(scanner.client, ClamdUnixSocket) - - scanner = setup_clamdscanner(settings, addr="127.0.0.1:1234", timeout=15.5) - assert isinstance(scanner.client, ClamdNetworkSocket) - assert scanner.client.host == "127.0.0.1" - assert scanner.client.port == 1234 - assert scanner.client.timeout == 15.5 - - -def test_clamdscanner_scan(mocker, settings): - OKAY_RET = ("OK", None) - ERROR_RET = ("ERROR", "Permission denied") - FOUND_RET = ("FOUND", "Eicar-Test-Signature") - - def patch(scanner, ret=OKAY_RET, excepts=False): - """Patch the scanner function and enable testing of exceptions raised - by clamdscanner that we want to control. excepts can take an argument - of True to pass a generic exception. excepts can also take an exception - as an argument for better granularity. - """ - deps = namedtuple("deps", ["pass_by_stream", "pass_by_reference"])( - pass_by_stream=mocker.patch.object( - scanner, "pass_by_stream", return_value={"stream": ret} - ), - pass_by_reference=mocker.patch.object( - scanner, "pass_by_reference", return_value={"/file": ret} - ), - ) - if excepts is not False: - e = excepts - if excepts is True: - e = Exception("Testing an unmanaged exception.") - deps.pass_by_stream.side_effect = e - deps.pass_by_reference.side_effect = e - return deps - - scanner = setup_clamdscanner(settings, stream=False) - deps = patch(scanner, ret=OKAY_RET) - passed, state, details = scanner.scan("/file") - assert passed is True - assert state == "OK" - assert details is None - deps.pass_by_stream.assert_not_called() - deps.pass_by_reference.assert_called_once() - - scanner = setup_clamdscanner(settings, stream=True) - deps = patch(scanner, ret=OKAY_RET) - passed, state, details = scanner.scan("/file") - assert passed is True - assert state == "OK" - assert details is None - deps.pass_by_stream.assert_called_once() - deps.pass_by_reference.assert_not_called() - - patch(scanner, ret=ERROR_RET) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state == "ERROR" - assert details == "Permission denied" - - patch(scanner, ret=FOUND_RET) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state == "FOUND" - assert details == "Eicar-Test-Signature" - - # Testing a generic Exception returned by the clamdscan micorservice. - patch(scanner, ret=OKAY_RET, excepts=True) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state is None - assert details is None - - # Testing a generic IOError that is not a broken pipe error that we're - # expecting to be able to manage from clamdscan. - patch(scanner, ret=OKAY_RET, excepts=OSError("Testing a generic IO Error")) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state is None - assert details is None - - # Broken pipe is a known error from the clamd library. - brokenpipe_error = OSError("Testing a broken pipe error") - brokenpipe_error.errno = errno.EPIPE - patch(scanner, ret=OKAY_RET, excepts=brokenpipe_error) - passed, state, details = scanner.scan("/file") - assert passed is None - assert state is None - assert details is None - - # The INSTREAM size limit error is known to us; test it here. - instream_error = BufferTooLongError("INSTREAM size limit exceeded. ERROR.") - patch(scanner, ret=OKAY_RET, excepts=instream_error) - passed, state, details = scanner.scan("/file") - assert passed is None - assert state is None - assert details is None - - # The clamd library can return a further error code here, and we we test it - # to make sure that if it does, it is managed. - connection_error = ConnectionError("Error while reading from socket.") - patch(scanner, ret=OKAY_RET, excepts=connection_error) - passed, state, details = scanner.scan("/file") - assert passed is None - assert state is None - assert details is None diff --git a/worker/tests/test_antivirus_clamscan.py b/worker/tests/test_antivirus_clamscan.py deleted file mode 100644 index 6a5deaa1..00000000 --- a/worker/tests/test_antivirus_clamscan.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Tests for the archivematica_clamscan.py client script.""" - -import subprocess - -import pytest - -from worker.clientScripts import archivematica_clamscan - - -@pytest.mark.parametrize( - "version, want", - [ - ( - "ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017", - ("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017"), - ), - ("ClamAV 0.99.2", ("ClamAV 0.99.2", None)), - ("Unexpected value", (None, None)), - ], -) -def test_clamav_version_parts(version, want): - got = archivematica_clamscan.clamav_version_parts(version) - assert got == want - - -def setup_clamscanner(): - return archivematica_clamscan.ClamScanner() - - -def test_clamscanner_version_props(mocker): - scanner = setup_clamscanner() - mocker.patch.object( - scanner, - "version_attrs", - return_value=("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017"), - ) - - assert scanner.program() == "ClamAV (clamscan)" - assert scanner.version() == "ClamAV 0.99.2" - assert scanner.virus_definitions() == "23992/Fri Oct 27 05:04:12 2017" - - -def test_clamscanner_version_attrs(mocker, settings): - scanner = setup_clamscanner() - mock = mocker.patch.object( - scanner, "_call", return_value="ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017" - ) - - assert scanner.version_attrs() == ( - "ClamAV 0.99.2", - "23992/Fri Oct 27 05:04:12 2017", - ) - mock.assert_called_once_with("-V") - - -def test_clamscanner_scan(mocker, settings): - scanner = setup_clamscanner() - mock = mocker.patch.object(scanner, "_call", return_value="Output of clamscan") - - # User configured thresholds need to be sent through to clamscanner and - # executed as part of the call to it. - settings.CLAMAV_CLIENT_MAX_FILE_SIZE = 20 - settings.CLAMAV_CLIENT_MAX_SCAN_SIZE = 20 - - max_file_size = "--max-filesize=%dM" % settings.CLAMAV_CLIENT_MAX_FILE_SIZE - max_scan_size = "--max-scansize=%dM" % settings.CLAMAV_CLIENT_MAX_SCAN_SIZE - - assert scanner.scan("/file") == (True, "OK", None) - mock.assert_called_once_with(max_file_size, max_scan_size, "/file") - - mock.side_effect = subprocess.CalledProcessError( - 1, "clamscan", "Output of clamscan" - ) - assert scanner.scan("/file") == (False, "FOUND", None) - - mock.side_effect = subprocess.CalledProcessError( - 2, "clamscan", "Output of clamscan" - ) - assert scanner.scan("/file") == (False, "ERROR", None) diff --git a/worker/uv.lock b/worker/uv.lock index fd85fa95..db284503 100644 --- a/worker/uv.lock +++ b/worker/uv.lock @@ -74,12 +74,12 @@ wheels = [ ] [[package]] -name = "clamd" -version = "1.0.2" +name = "clamav-client" +version = "0.5.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/14/8b/55332f1f79f28a5ccc50f66364087e64fae8e4ed62e52007ca82b3072221/clamd-1.0.2.tar.gz", hash = "sha256:d82a2fd814684a35a1b31feadafb2e69c8ebde9403613f6bdaa5d877c0f29560", size = 8218 } +sdist = { url = "https://files.pythonhosted.org/packages/e3/34/556300be3f992627749ab205055e2faf149c56b2a42dc3335291c9b84f6a/clamav_client-0.5.1.tar.gz", hash = "sha256:cfab214aeff3ea2ff293f4c164c1eb2330b5c34081d4a0aa3012e481ab7ab137", size = 29643 } wheels = [ - { url = "https://files.pythonhosted.org/packages/3d/d0/84614de2a53ad52370adc9f9260bea420e53e0c228a248ec0eacfa65ccbb/clamd-1.0.2-py2.py3-none-any.whl", hash = "sha256:5c32546b7d1eb00fd6be00a889d79e00fbf980ed082826ccfa369bce3dcff5e7", size = 6684 }, + { url = "https://files.pythonhosted.org/packages/5d/39/6b4b05a2900de88c69d6e35400bb8bed12b91a28ed9bb4112ce2c2d59af0/clamav_client-0.5.1-py3-none-any.whl", hash = "sha256:1f5c8465f09e5158fe3f10bb552ccc7ff1e0c7e10c597ed481c9ba1d75b74803", size = 13593 }, ] [[package]] @@ -481,7 +481,7 @@ dependencies = [ { name = "agentarchives" }, { name = "ammcpc" }, { name = "bagit" }, - { name = "clamd" }, + { name = "clamav-client" }, { name = "django" }, { name = "django-autoslug" }, { name = "django-tastypie" }, @@ -510,7 +510,7 @@ requires-dist = [ { name = "agentarchives", specifier = ">=0.9.0" }, { name = "ammcpc", specifier = ">=0.2.0" }, { name = "bagit", git = "https://github.com/artefactual-labs/bagit-python?rev=902051d8410219f6c5f4ce6d43e5b272cf29e89b" }, - { name = "clamd", specifier = ">=1.0.2" }, + { name = "clamav-client", specifier = ">=0.5.1" }, { name = "django", specifier = ">=4.2,<5" }, { name = "django-autoslug", specifier = ">=1.9.9" }, { name = "django-tastypie", specifier = ">=0.14.7" }, diff --git a/worker/worker/clientScripts/archivematica_clamscan.py b/worker/worker/clientScripts/archivematica_clamscan.py index 15085075..2d020eaa 100755 --- a/worker/worker/clientScripts/archivematica_clamscan.py +++ b/worker/worker/clientScripts/archivematica_clamscan.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # This file is part of Archivematica. # -# Copyright 2010-2017 Artefactual Systems Inc. +# Copyright 2010-2024 Artefactual Systems Inc. # # Archivematica is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by @@ -15,26 +15,41 @@ # # You should have received a copy of the GNU General Public License # along with Archivematica. If not, see . -import abc +"""Virus scanner compatible with ClamAV. + +Settings available: + +- ``CLAMAV_CLIENT_BACKEND`` (str) +- ``CLAMAV_SERVER`` (str) +- ``CLAMAV_PASS_BY_STREAM`` (bool) +- ``CLAMAV_CLIENT_TIMEOUT`` (float) +- ``CLAMAV_CLIENT_MAX_FILE_SIZE`` (float) +- ``CLAMAV_CLIENT_MAX_SCAN_SIZE`` (float) +""" + import argparse -import errno +import dataclasses import multiprocessing import os -import re -import subprocess import uuid +from typing import List +from typing import Literal +from typing import Optional +from typing import TypedDict +from typing import cast import django -from clamd import BufferTooLongError -from clamd import ClamdNetworkSocket -from clamd import ClamdUnixSocket -from clamd import ConnectionError -from django.conf import settings as django_settings +from clamav_client import get_scanner +from clamav_client.scanner import Scanner +from clamav_client.scanner import ScannerConfig +from django.conf import settings from django.core.exceptions import ValidationError from django.db import transaction +from django.utils.functional import LazyObject django.setup() +from worker.client.job import Job from worker.main.models import Event from worker.main.models import File from worker.utils.custom_handlers import get_script_logger @@ -43,203 +58,56 @@ logger = get_script_logger("archivematica.worker.clamscan") -def concurrent_instances(): +def concurrent_instances() -> int: return multiprocessing.cpu_count() -def clamav_version_parts(ver): - """Both clamscan and clamd return a version string that looks like the - following:: +CLAMD_NAMES = ("clamdscanner", "clamd") +CLAMSCAN_NAMES = ("clamscanner", "clamscan") - ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017 - Given the example above, this function returns a tuple as follows:: +EventOutcome = Literal["Pass", "Fail"] - ("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017") - Both elements may be None if the matching failed. - """ - parts = ver.split("/") - n = len(parts) - if n == 1: - version = parts[0] - if re.match("^ClamAV", version): - return version, None - elif n == 3: - version, defs, date = parts - return version, f"{defs}/{date}" - return None, None - - -class ScannerBase(metaclass=abc.ABCMeta): - @abc.abstractmethod - def scan(self, path): - """Scan a file and return a tuple of three elements reporting the - results. These are the three elements expected: - 1. passed (bool) - 2. state (str - "OK", "ERROR", or "FOUND") - 3. details (str - extra info when ERROR or FOUND) - """ - - @abc.abstractproperty - def version_attrs(self): - """Obtain the version details. It is expected to return a tuple of two - elements: ClamAV version number and virus definition version number. - The implementor can cache the results. - """ - - def program(self): - return self.PROGRAM - - def version(self): - return self.version_attrs()[0] - - def virus_definitions(self): - return self.version_attrs()[1] - - -class ClamdScanner(ScannerBase): - PROGRAM = "ClamAV (clamd)" - - def __init__(self): - self.addr = django_settings.CLAMAV_SERVER - self.timeout = django_settings.CLAMAV_CLIENT_TIMEOUT - self.stream = django_settings.CLAMAV_PASS_BY_STREAM - self.client = self.get_client() - - def scan(self, path): - if self.stream: - method_name = "pass_by_stream" - result_key = "stream" - else: - method_name = "pass_by_reference" - result_key = path - - passed, state, details = (False, None, None) - try: - result = getattr(self, method_name)(path) - state, details = result[result_key] - except Exception as err: - passed = ClamdScanner.clamd_exception_handler(err) - if state == "OK": - passed = True - return passed, state, details - - @staticmethod - def clamd_exception_handler(err): - """Manage each decision for an exception when it is raised. Ensure - that each decision can be tested to meet the documented Archivematica - antivirus feature definition. - """ - if isinstance(err, IOError): - if err.errno == errno.EPIPE: - logger.error( - "[Errno 32] Broken pipe. File not scanned. Check Clamd " - "StreamMaxLength" - ) - return None - elif isinstance(err, BufferTooLongError): - logger.error( - "Clamd BufferTooLongError. File not scanned. Check Clamd " - "StreamMaxLength" - ) - return None - elif isinstance(err, ConnectionError): - logger.error( - "Clamd ConnectionError. File not scanned. Check Clamd " "output: %s", - err, - ) - return None - # Return False and provide some information to the user for all other - # failures. - logger.error("Virus scanning failed: %s", err, exc_info=True) - return False +class QueuedEvent(TypedDict): + fileUUID: str + eventIdentifierUUID: str + eventType: str + eventDateTime: str + eventDetail: str + eventOutcome: EventOutcome - def version_attrs(self): - try: - self._version_attrs - except AttributeError: - self._version_attrs = clamav_version_parts(self.client.version()) - return self._version_attrs - def get_client(self): - if ":" not in self.addr: - return ClamdUnixSocket(path=self.addr) - host, port = self.addr.split(":") - return ClamdNetworkSocket(host=host, port=int(port), timeout=self.timeout) +EventQueue = List[QueuedEvent] - def pass_by_reference(self, path): - logger.info( - "File being being read by Clamdscan from filesystem \ - reference." - ) - return self.client.scan(path) - - def pass_by_stream(self, path): - logger.info("File contents being streamed to Clamdscan.") - return self.client.instream(open(path, "rb")) - - -class ClamScanner(ScannerBase): - PROGRAM = "ClamAV (clamscan)" - COMMAND = "clamscan" - - def _call(self, *args): - return subprocess.check_output((self.COMMAND,) + args) - - def scan(self, path): - passed, state, details = (False, "ERROR", None) - try: - max_file_size = ( - "--max-filesize=%dM" % django_settings.CLAMAV_CLIENT_MAX_FILE_SIZE - ) - max_scan_size = ( - "--max-scansize=%dM" % django_settings.CLAMAV_CLIENT_MAX_SCAN_SIZE - ) - self._call(max_file_size, max_scan_size, path) - except subprocess.CalledProcessError as err: - if err.returncode == 1: - state = "FOUND" - else: - logger.error("Virus scanning failed: %s", err.output, exc_info=True) - else: - passed, state = (True, "OK") - return passed, state, details - - def version_attrs(self): - try: - self._version_attrs - except AttributeError: - try: - self._version_attrs = clamav_version_parts(self._call("-V")) - except subprocess.CalledProcessError: - self._version_attrs = (None, None) - return self._version_attrs - - -def file_already_scanned(file_uuid): - return ( - file_uuid != "None" - and Event.objects.filter( - file_uuid_id=file_uuid, event_type="virus check" - ).exists() - ) +@dataclasses.dataclass +class Args: + file_uuid: str + path: str + date: str -def queue_event(file_uuid, date, scanner, passed, queue): - if passed is None or file_uuid == "None": - return - event_detail = "" - if scanner is not None: - event_detail = f'program="{scanner.program()}"; version="{scanner.version()}"; virusDefinitions="{scanner.virus_definitions()}"' +def file_already_scanned(file_id: uuid.UUID) -> bool: + qs = Event.objects.filter(file_uuid_id=file_id, event_type="virus check") + return cast(bool, qs.exists()) - outcome = "Pass" if passed else "Fail" - logger.info("Recording new event for file %s (outcome: %s)", file_uuid, outcome) +def queue_event( + scanner: Scanner, + queue: EventQueue, + file_id: uuid.UUID, + date: str, + passed: bool, +) -> None: + event_detail = "" + info = scanner.info() # This is cached. + event_detail = f'program="{info.name}"; version="{info.version}"; virusDefinitions="{info.virus_definitions}"' + outcome: EventOutcome = "Pass" if passed else "Fail" + logger.info("Recording new event for file %s (outcome: %s)", file_id, outcome) queue.append( { - "fileUUID": file_uuid, + "fileUUID": str(file_id), "eventIdentifierUUID": str(uuid.uuid4()), "eventType": "virus check", "eventDateTime": date, @@ -249,47 +117,13 @@ def queue_event(file_uuid, date, scanner, passed, queue): ) -def get_parser(): - """Return a ``Namespace`` with the parsed arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument("file_uuid", metavar="fileUUID") - parser.add_argument("path", metavar="PATH", help="File or directory location") - parser.add_argument("date", metavar="DATE") - parser.add_argument( - "task_uuid", metavar="taskUUID", help="Currently unused, feel free to ignore." - ) - return parser - - -SCANNERS = (ClamScanner, ClamdScanner) -SCANNERS_NAMES = tuple(b.__name__.lower() for b in SCANNERS) -DEFAULT_SCANNER = ClamdScanner - - -def get_scanner(): - """Return the ClamAV client configured by the user and found in the - installation's environment variables. Clamdscanner may perform quicker - than Clamscanner given a larger number of objects. Return clamdscanner - object as a default if no other, or an incorrect value is specified. - """ - choice = str(django_settings.CLAMAV_CLIENT_BACKEND).lower() - if choice not in SCANNERS_NAMES: - logger.warning( - "Unexpected antivirus scanner (CLAMAV_CLIENT_BACKEND):" ' "%s"; using %s.', - choice, - DEFAULT_SCANNER.__name__, - ) - return DEFAULT_SCANNER() - return SCANNERS[SCANNERS_NAMES.index(choice)]() - - -def get_size(file_uuid, path): +def get_size(file_id: uuid.UUID, path: str) -> Optional[int]: # We're going to see this happening when files are not part of `objects/`. - if file_uuid != "None": - try: - return File.objects.get(uuid=file_uuid).size - except (File.DoesNotExist, ValidationError): - pass + try: + size = File.objects.get(uuid=file_id).size + return cast(Optional[int], size) + except (File.DoesNotExist, ValidationError): + pass # Our fallback. try: return os.path.getsize(path) @@ -297,78 +131,133 @@ def get_size(file_uuid, path): return None -def scan_file(event_queue, file_uuid, path, date, task_uuid): - if file_already_scanned(file_uuid): - logger.info("Virus scan already performed, not running scan again") - return 0 +def valid_max_settings(size: int, max_file_size: float, max_scan_size: float) -> bool: + max_file_size = max_file_size * 1024 * 1024 + max_scan_size = max_scan_size * 1024 * 1024 + if size > max_file_size: + logger.info( + "File will not be scanned. Size %s bytes greater than scanner " + "max file size %s bytes", + size, + max_file_size, + ) + return False + elif size > max_scan_size: + logger.info( + "File will not be scanned. Size %s bytes greater than scanner " + "max scan size %s bytes", + size, + max_scan_size, + ) + return False + return True - scanner, passed = None, False +def scan_file( + scanner: Scanner, + event_queue: EventQueue, + opts: Args, +) -> int: + """Scan an individual file. + + Returns 1 to indicate that analyzing the file was impossible. + Returns 0 if the file can proceed through the process without errors. + """ + try: + file_id = uuid.UUID(opts.file_uuid) + except Exception as err: + logger.error("File skipped: file_uuid (%s) is not a valid UUID.", err) + return 1 + if file_already_scanned(file_id): + logger.info("Virus scan already performed, not running scan again") + return 0 + passed: Optional[bool] = False try: - size = get_size(file_uuid, path) + size = get_size(file_id, opts.path) if size is None: + passed = None logger.error("Getting file size returned: %s", size) return 1 - - max_file_size = django_settings.CLAMAV_CLIENT_MAX_FILE_SIZE * 1024 * 1024 - max_scan_size = django_settings.CLAMAV_CLIENT_MAX_SCAN_SIZE * 1024 * 1024 - - valid_scan = True - - if size > max_file_size: - logger.info( - "File will not be scanned. Size %s bytes greater than scanner " - "max file size %s bytes", - size, - max_file_size, - ) - valid_scan = False - elif size > max_scan_size: - logger.info( - "File will not be scanned. Size %s bytes greater than scanner " - "max scan size %s bytes", - size, - max_scan_size, - ) - valid_scan = False - - if valid_scan: - scanner = get_scanner() - logger.info( - "Using scanner %s (%s - %s)", - scanner.program(), - scanner.version(), - scanner.virus_definitions(), - ) - - passed, state, details = scanner.scan(path) + if valid_max_settings( + size, + settings.CLAMAV_CLIENT_MAX_FILE_SIZE, + settings.CLAMAV_CLIENT_MAX_SCAN_SIZE, + ): + result = scanner.scan(opts.path) + passed = True + state = result.state + details = result.details else: passed, state, details = None, None, None - except Exception: - logger.error("Unexpected error scanning file %s", path, exc_info=True) + logger.error("Unexpected error scanning file %s", opts.path, exc_info=True) return 1 else: # record pass or fail, but not None if the file hasn't # been scanned, e.g. Max File Size thresholds being too low. if passed is not None: - logger.info("File %s scanned!", path) + logger.info("File %s scanned!", opts.path) logger.debug("passed=%s state=%s details=%s", passed, state, details) finally: - queue_event(file_uuid, date, scanner, passed, event_queue) - - # If True or None, then we have no error, the file can move through the - # process as expected... + if passed is not None: + queue_event(scanner, event_queue, file_id, opts.date, passed) return 1 if passed is False else 0 -def call(jobs): - event_queue = [] +def build_scanner(settings: LazyObject) -> Scanner: + backend = str(settings.CLAMAV_CLIENT_BACKEND).lower() + config: ScannerConfig + if backend in CLAMD_NAMES: + config = { + "backend": "clamd", + "address": str(settings.CLAMAV_SERVER), + "timeout": int(settings.CLAMAV_CLIENT_TIMEOUT), + "stream": bool(settings.CLAMAV_PASS_BY_STREAM), + } + elif backend in CLAMSCAN_NAMES: + config = { + "backend": "clamscan", + "max_file_size": float(settings.CLAMAV_CLIENT_MAX_FILE_SIZE), + "max_scan_size": float(settings.CLAMAV_CLIENT_MAX_SCAN_SIZE), + } + return get_scanner(config) + +def get_parser() -> argparse.ArgumentParser: + """Return a ``Namespace`` with the parsed arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("file_uuid", metavar="fileUUID") + parser.add_argument("path", metavar="PATH", help="File or directory location") + parser.add_argument("date", metavar="DATE") + return parser + + +def parse_args(parser: argparse.ArgumentParser, job: Job) -> Args: + namespace = parser.parse_args(job.args[1:]) + + return Args(**vars(namespace)) + + +def main(jobs: List[Job]) -> None: + parser = get_parser() + event_queue: EventQueue = [] + scanner = build_scanner(settings) + info = scanner.info() + logger.info( + "Using scanner %s (%s - %s)", info.name, info.version, info.virus_definitions + ) + + # TODO: refactor to scan the entire batch of jobs at once, rather than + # processing one job at a time. This is particularly beneficial when using + # ClamscanScanner because we only ask ClamAV to load the signatures once. for job in jobs: with job.JobContext(logger=logger): - job.set_status(scan_file(event_queue, *job.args[1:])) - + opts = parse_args(parser, job) + job.set_status(scan_file(scanner, event_queue, opts)) with transaction.atomic(): for e in event_queue: insertIntoEvents(**e) + + +def call(jobs: List[Job]) -> None: + main(jobs) diff --git a/worker/worker/utils/custom_handlers.py b/worker/worker/utils/custom_handlers.py index 41ce4e99..d1541080 100644 --- a/worker/worker/utils/custom_handlers.py +++ b/worker/worker/utils/custom_handlers.py @@ -34,7 +34,7 @@ def emit(self, record): def get_script_logger( name, formatter=SCRIPT_FILE_FORMAT, root="archivematica", level=logging.INFO -): +) -> logging.Logger: logging_config = { "version": 1, "disable_existing_loggers": False, diff --git a/worker/worker/utils/databaseFunctions.py b/worker/worker/utils/databaseFunctions.py index b6c04c7d..2c8cd4f5 100644 --- a/worker/worker/utils/databaseFunctions.py +++ b/worker/worker/utils/databaseFunctions.py @@ -125,7 +125,7 @@ def insertIntoEvents( eventOutcome="", eventOutcomeDetailNote="", agents=None, -): +) -> Event: """Creates a new entry in the Events table using the supplied arguments. :param str fileUUID: The UUID of the file with which this event is