diff --git a/worker/pyproject.toml b/worker/pyproject.toml index d77b6df8..093cabc7 100644 --- a/worker/pyproject.toml +++ b/worker/pyproject.toml @@ -7,7 +7,7 @@ dependencies = [ "agentarchives>=0.9.0", "ammcpc>=0.2.0", "bagit", - "clamd>=1.0.2", + "clamav-client>=0.5.0", "django-autoslug>=1.9.9", "django-tastypie>=0.14.7", "gearman3", @@ -88,6 +88,7 @@ warn_unused_configs = true [[tool.mypy.overrides]] module = [ "worker.client.*", + "worker.clientScripts.archivematica_clamscan", "worker.clientScripts.characterize_file", "worker.clientScripts.has_packages", "worker.clientScripts.identify_file_format", @@ -96,6 +97,7 @@ module = [ "worker.clientScripts.transcribe_file", "worker.clientScripts.validate_file", "worker.tests.conftest", + "worker.tests.test_antivirus", "worker.tests.test_characterize_file", "worker.tests.test_has_packages", "worker.tests.test_identify_file_format", diff --git a/worker/tests/test_antivirus.py b/worker/tests/test_antivirus.py index 6d0e883a..1d6089b3 100644 --- a/worker/tests/test_antivirus.py +++ b/worker/tests/test_antivirus.py @@ -1,199 +1,26 @@ -"""Tests for the archivematica_clamscan.py client script.""" - -from collections import OrderedDict -from collections import namedtuple +from unittest import mock import pytest -from tests import test_antivirus_clamdscan +from worker.client.job import Job from worker.clientScripts import archivematica_clamscan - - -def test_get_scanner(settings): - """Test that get_scanner returns the correct instance of antivirus - per the user's configuration. Test return of clamdscanner by default.""" - - # Ensure that environment settings are available to the mock classes. - test_antivirus_clamdscan.setup_clamdscanner(settings) - - # Testing to ensure clamscanner is returned when explicitly set. - settings.CLAMAV_CLIENT_BACKEND = "clamscanner" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamScanner) - - # Testing to ensure that clamdscanner is returned when explicitly set. - settings.CLAMAV_CLIENT_BACKEND = "clamdscanner" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is the default returned scanner. - settings.CLAMAV_CLIENT_BACKEND = "fprot" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is the default returned scanner when - # the user configures an empty string. - settings.CLAMAV_CLIENT_BACKEND = "" - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is returned when the environment - # hasn't been configured appropriately and None is returned. - settings.CLAMAV_CLIENT_BACKEND = None - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - # Testing to ensure that clamdscanner is returned when another variable - # type is specified, e.g. in this instance, an integer. - settings.CLAMAV_CLIENT_BACKEND = 10 - scanner = archivematica_clamscan.get_scanner() - assert isinstance(scanner, archivematica_clamscan.ClamdScanner) - - -args = OrderedDict() -args["file_uuid"] = "ec26199f-72a4-4fd8-a94a-29144b02ddd8" -args["path"] = "/path" -args["date"] = "2019-12-01" -args["task_uuid"] = "c380e94e-7a7b-4ab8-aa72-ec0644cc3f5d" - - -class FileMock: - def __init__(self, size): - self.size = size - - -class ScannerMock(archivematica_clamscan.ScannerBase): - PROGRAM = "Mock" - - def __init__(self, should_except=False, passed=False): - self.should_except = should_except - self.passed = passed - - def scan(self, path): - if self.should_except: - raise Exception("Something really bad happened!") - return self.passed, None, None - - def version_attrs(self): - return ("version", "virus_definitions") - - -def setup_test_scan_file_mocks( - mocker, - file_already_scanned=False, - file_size=1024, - scanner_should_except=False, - scanner_passed=False, -): - deps = namedtuple("deps", ["file_already_scanned", "file_get", "scanner"])( - file_already_scanned=mocker.patch( - "worker.clientScripts.archivematica_clamscan.file_already_scanned", - return_value=file_already_scanned, - ), - file_get=mocker.patch( - "worker.main.models.File.objects.get", return_value=FileMock(size=file_size) - ), - scanner=ScannerMock(should_except=scanner_should_except, passed=scanner_passed), - ) - - mocker.patch( - "worker.clientScripts.archivematica_clamscan.get_scanner", - return_value=deps.scanner, +from worker.main.models import File + + +@pytest.mark.django_db +def test_antivirus(sip_file: File) -> None: + job = mock.Mock( + args=[ + "archivematica_clamscan.py", + str(sip_file.pk), + "path", + "date", + "task_uuid_not_used", + ], + JobContext=mock.MagicMock(), + spec=Job, ) - return deps - - -def test_scan_file_already_scanned(mocker): - deps = setup_test_scan_file_mocks(mocker, file_already_scanned=True) - - exit_code = archivematica_clamscan.scan_file([], **dict(args)) - - assert exit_code == 0 - deps.file_already_scanned.assert_called_once_with(args["file_uuid"]) - - -QueueEventParams = namedtuple("QueueEventParams", ["scanner_is_None", "passed"]) - - -@pytest.mark.parametrize( - "setup_kwargs, exit_code, queue_event_params", - [ - # File size too big for given file_size param - ( - {"file_size": 43, "scanner_passed": None}, - 0, - QueueEventParams(scanner_is_None=None, passed=None), - ), - # File size too big for given file_scan param - ( - {"file_size": 85, "scanner_passed": None}, - 0, - QueueEventParams(scanner_is_None=None, passed=None), - ), - # File size within given file_size param, and file_scan param - ( - {"file_size": 42, "scanner_passed": True}, - 0, - QueueEventParams(scanner_is_None=False, passed=True), - ), - # Scan returns None with no-error, e.g. Broken Pipe - ( - {"scanner_passed": None}, - 0, - QueueEventParams(scanner_is_None=None, passed=None), - ), - # Zero byte file passes - ( - {"file_size": 0, "scanner_passed": True}, - 0, - QueueEventParams(scanner_is_None=False, passed=True), - ), - # Virus found - ( - {"scanner_passed": False}, - 1, - QueueEventParams(scanner_is_None=False, passed=False), - ), - # Passed - ( - {"scanner_passed": True}, - 0, - QueueEventParams(scanner_is_None=False, passed=True), - ), - ], -) -def test_scan_file(mocker, setup_kwargs, exit_code, queue_event_params, settings): - setup_test_scan_file_mocks(mocker, **setup_kwargs) - - # Here the user configurable thresholds for maimum file size, and maximum - # scan size are being tested. The scan size is offset so as to enable the - # test to fall through correctly and eventually return None for - # not-scanned. - settings.CLAMAV_CLIENT_MAX_FILE_SIZE = 42 - settings.CLAMAV_CLIENT_MAX_SCAN_SIZE = 84 - - event_queue = [] - - ret = archivematica_clamscan.scan_file(event_queue, **dict(args)) - - # The integer returned by scan_file() is going to be used as the exit code - # of the archivematica_clamscan.py script which is important for the AM - # workflow in order to control what to do next. - assert exit_code == ret - - # A side effect of scan_file() is to queue an event to be created in the - # database. - if queue_event_params.passed is None: - assert len(event_queue) == 0 - else: - assert len(event_queue) == 1 + archivematica_clamscan.call([job]) - event = event_queue[0] - assert event["eventType"] == "virus check" - assert event["fileUUID"] == args["file_uuid"] - assert ( - event["eventOutcome"] == "Pass" - if setup_kwargs["scanner_passed"] - else "Fail" - ) + job.set_status.assert_called_once_with(1) diff --git a/worker/tests/test_antivirus_clamdscan.py b/worker/tests/test_antivirus_clamdscan.py deleted file mode 100644 index cb978f8c..00000000 --- a/worker/tests/test_antivirus_clamdscan.py +++ /dev/null @@ -1,159 +0,0 @@ -"""Tests for the archivematica_clamscan.py client script.""" - -import errno -from collections import namedtuple - -from clamd import BufferTooLongError -from clamd import ClamdNetworkSocket -from clamd import ClamdUnixSocket -from clamd import ConnectionError - -from worker.clientScripts import archivematica_clamscan - - -def setup_clamdscanner( - settings, addr="/var/run/clamav/clamd.ctl", timeout=10, stream=False -): - settings.CLAMAV_SERVER = addr - settings.CLAMAV_CLIENT_TIMEOUT = timeout - settings.CLAMAV_PASS_BY_STREAM = stream - - return archivematica_clamscan.ClamdScanner() - - -def test_clamdscanner_version_props(mocker, settings): - scanner = setup_clamdscanner(settings) - mocker.patch.object( - scanner, - "version_attrs", - return_value=("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017"), - ) - - assert scanner.program() == "ClamAV (clamd)" - assert scanner.version() == "ClamAV 0.99.2" - assert scanner.virus_definitions() == "23992/Fri Oct 27 05:04:12 2017" - - -def test_clamdscanner_version_attrs(mocker, settings): - scanner = setup_clamdscanner(settings, addr="/var/run/clamav/clamd.ctl") - version = mocker.patch.object( - scanner.client, - "version", - return_value="ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017", - ) - - assert scanner.version_attrs() == ( - "ClamAV 0.99.2", - "23992/Fri Oct 27 05:04:12 2017", - ) - version.assert_called_once() - - -def test_clamdscanner_get_client(settings): - scanner = setup_clamdscanner(settings, addr="/var/run/clamav/clamd.ctl") - assert isinstance(scanner.client, ClamdUnixSocket) - - scanner = setup_clamdscanner(settings, addr="127.0.0.1:1234", timeout=15.5) - assert isinstance(scanner.client, ClamdNetworkSocket) - assert scanner.client.host == "127.0.0.1" - assert scanner.client.port == 1234 - assert scanner.client.timeout == 15.5 - - -def test_clamdscanner_scan(mocker, settings): - OKAY_RET = ("OK", None) - ERROR_RET = ("ERROR", "Permission denied") - FOUND_RET = ("FOUND", "Eicar-Test-Signature") - - def patch(scanner, ret=OKAY_RET, excepts=False): - """Patch the scanner function and enable testing of exceptions raised - by clamdscanner that we want to control. excepts can take an argument - of True to pass a generic exception. excepts can also take an exception - as an argument for better granularity. - """ - deps = namedtuple("deps", ["pass_by_stream", "pass_by_reference"])( - pass_by_stream=mocker.patch.object( - scanner, "pass_by_stream", return_value={"stream": ret} - ), - pass_by_reference=mocker.patch.object( - scanner, "pass_by_reference", return_value={"/file": ret} - ), - ) - if excepts is not False: - e = excepts - if excepts is True: - e = Exception("Testing an unmanaged exception.") - deps.pass_by_stream.side_effect = e - deps.pass_by_reference.side_effect = e - return deps - - scanner = setup_clamdscanner(settings, stream=False) - deps = patch(scanner, ret=OKAY_RET) - passed, state, details = scanner.scan("/file") - assert passed is True - assert state == "OK" - assert details is None - deps.pass_by_stream.assert_not_called() - deps.pass_by_reference.assert_called_once() - - scanner = setup_clamdscanner(settings, stream=True) - deps = patch(scanner, ret=OKAY_RET) - passed, state, details = scanner.scan("/file") - assert passed is True - assert state == "OK" - assert details is None - deps.pass_by_stream.assert_called_once() - deps.pass_by_reference.assert_not_called() - - patch(scanner, ret=ERROR_RET) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state == "ERROR" - assert details == "Permission denied" - - patch(scanner, ret=FOUND_RET) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state == "FOUND" - assert details == "Eicar-Test-Signature" - - # Testing a generic Exception returned by the clamdscan micorservice. - patch(scanner, ret=OKAY_RET, excepts=True) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state is None - assert details is None - - # Testing a generic IOError that is not a broken pipe error that we're - # expecting to be able to manage from clamdscan. - patch(scanner, ret=OKAY_RET, excepts=OSError("Testing a generic IO Error")) - passed, state, details = scanner.scan("/file") - assert passed is False - assert state is None - assert details is None - - # Broken pipe is a known error from the clamd library. - brokenpipe_error = OSError("Testing a broken pipe error") - brokenpipe_error.errno = errno.EPIPE - patch(scanner, ret=OKAY_RET, excepts=brokenpipe_error) - passed, state, details = scanner.scan("/file") - assert passed is None - assert state is None - assert details is None - - # The INSTREAM size limit error is known to us; test it here. - instream_error = BufferTooLongError("INSTREAM size limit exceeded. ERROR.") - patch(scanner, ret=OKAY_RET, excepts=instream_error) - passed, state, details = scanner.scan("/file") - assert passed is None - assert state is None - assert details is None - - # The clamd library can return a further error code here, and we we test it - # to make sure that if it does, it is managed. - connection_error = ConnectionError("Error while reading from socket.") - patch(scanner, ret=OKAY_RET, excepts=connection_error) - passed, state, details = scanner.scan("/file") - assert passed is None - assert state is None - assert details is None diff --git a/worker/tests/test_antivirus_clamscan.py b/worker/tests/test_antivirus_clamscan.py deleted file mode 100644 index 6a5deaa1..00000000 --- a/worker/tests/test_antivirus_clamscan.py +++ /dev/null @@ -1,79 +0,0 @@ -"""Tests for the archivematica_clamscan.py client script.""" - -import subprocess - -import pytest - -from worker.clientScripts import archivematica_clamscan - - -@pytest.mark.parametrize( - "version, want", - [ - ( - "ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017", - ("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017"), - ), - ("ClamAV 0.99.2", ("ClamAV 0.99.2", None)), - ("Unexpected value", (None, None)), - ], -) -def test_clamav_version_parts(version, want): - got = archivematica_clamscan.clamav_version_parts(version) - assert got == want - - -def setup_clamscanner(): - return archivematica_clamscan.ClamScanner() - - -def test_clamscanner_version_props(mocker): - scanner = setup_clamscanner() - mocker.patch.object( - scanner, - "version_attrs", - return_value=("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017"), - ) - - assert scanner.program() == "ClamAV (clamscan)" - assert scanner.version() == "ClamAV 0.99.2" - assert scanner.virus_definitions() == "23992/Fri Oct 27 05:04:12 2017" - - -def test_clamscanner_version_attrs(mocker, settings): - scanner = setup_clamscanner() - mock = mocker.patch.object( - scanner, "_call", return_value="ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017" - ) - - assert scanner.version_attrs() == ( - "ClamAV 0.99.2", - "23992/Fri Oct 27 05:04:12 2017", - ) - mock.assert_called_once_with("-V") - - -def test_clamscanner_scan(mocker, settings): - scanner = setup_clamscanner() - mock = mocker.patch.object(scanner, "_call", return_value="Output of clamscan") - - # User configured thresholds need to be sent through to clamscanner and - # executed as part of the call to it. - settings.CLAMAV_CLIENT_MAX_FILE_SIZE = 20 - settings.CLAMAV_CLIENT_MAX_SCAN_SIZE = 20 - - max_file_size = "--max-filesize=%dM" % settings.CLAMAV_CLIENT_MAX_FILE_SIZE - max_scan_size = "--max-scansize=%dM" % settings.CLAMAV_CLIENT_MAX_SCAN_SIZE - - assert scanner.scan("/file") == (True, "OK", None) - mock.assert_called_once_with(max_file_size, max_scan_size, "/file") - - mock.side_effect = subprocess.CalledProcessError( - 1, "clamscan", "Output of clamscan" - ) - assert scanner.scan("/file") == (False, "FOUND", None) - - mock.side_effect = subprocess.CalledProcessError( - 2, "clamscan", "Output of clamscan" - ) - assert scanner.scan("/file") == (False, "ERROR", None) diff --git a/worker/uv.lock b/worker/uv.lock index fd85fa95..73e83671 100644 --- a/worker/uv.lock +++ b/worker/uv.lock @@ -74,12 +74,12 @@ wheels = [ ] [[package]] -name = "clamd" -version = "1.0.2" +name = "clamav-client" +version = "0.5.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/14/8b/55332f1f79f28a5ccc50f66364087e64fae8e4ed62e52007ca82b3072221/clamd-1.0.2.tar.gz", hash = "sha256:d82a2fd814684a35a1b31feadafb2e69c8ebde9403613f6bdaa5d877c0f29560", size = 8218 } +sdist = { url = "https://files.pythonhosted.org/packages/30/9e/5f51bc9ba8a57e9bd33aa761103c00dcc3bdc36197d33d30f7b3f7efb218/clamav_client-0.5.0.tar.gz", hash = "sha256:48e0768f35d4f1b100f3f68ec5feb0b91e406a2ad8f262e0ac1ea1405ad727b3", size = 29632 } wheels = [ - { url = "https://files.pythonhosted.org/packages/3d/d0/84614de2a53ad52370adc9f9260bea420e53e0c228a248ec0eacfa65ccbb/clamd-1.0.2-py2.py3-none-any.whl", hash = "sha256:5c32546b7d1eb00fd6be00a889d79e00fbf980ed082826ccfa369bce3dcff5e7", size = 6684 }, + { url = "https://files.pythonhosted.org/packages/41/f6/39d8884462446d29e3b9e0bd652d8e7b2c016a52e1b4b787858f3c0991c5/clamav_client-0.5.0-py3-none-any.whl", hash = "sha256:199cf38f86fb435d26d9123c254c6cdc1e5b331816304f808f5c984e1f3e97c5", size = 13583 }, ] [[package]] @@ -481,7 +481,7 @@ dependencies = [ { name = "agentarchives" }, { name = "ammcpc" }, { name = "bagit" }, - { name = "clamd" }, + { name = "clamav-client" }, { name = "django" }, { name = "django-autoslug" }, { name = "django-tastypie" }, @@ -510,7 +510,7 @@ requires-dist = [ { name = "agentarchives", specifier = ">=0.9.0" }, { name = "ammcpc", specifier = ">=0.2.0" }, { name = "bagit", git = "https://github.com/artefactual-labs/bagit-python?rev=902051d8410219f6c5f4ce6d43e5b272cf29e89b" }, - { name = "clamd", specifier = ">=1.0.2" }, + { name = "clamav-client", specifier = ">=0.5.0" }, { name = "django", specifier = ">=4.2,<5" }, { name = "django-autoslug", specifier = ">=1.9.9" }, { name = "django-tastypie", specifier = ">=0.14.7" }, diff --git a/worker/worker/clientScripts/archivematica_clamscan.py b/worker/worker/clientScripts/archivematica_clamscan.py index 15085075..3a3cd1e4 100755 --- a/worker/worker/clientScripts/archivematica_clamscan.py +++ b/worker/worker/clientScripts/archivematica_clamscan.py @@ -15,26 +15,26 @@ # # You should have received a copy of the GNU General Public License # along with Archivematica. If not, see . -import abc import argparse -import errno +import dataclasses import multiprocessing import os -import re -import subprocess import uuid +from typing import Any +from typing import Dict +from typing import List +from typing import Optional import django -from clamd import BufferTooLongError -from clamd import ClamdNetworkSocket -from clamd import ClamdUnixSocket -from clamd import ConnectionError -from django.conf import settings as django_settings +from clamav_client import get_scanner +from clamav_client.scanner import Scanner +from django.conf import settings from django.core.exceptions import ValidationError from django.db import transaction django.setup() +from worker.client.job import Job from worker.main.models import Event from worker.main.models import File from worker.utils.custom_handlers import get_script_logger @@ -42,182 +42,22 @@ logger = get_script_logger("archivematica.worker.clamscan") +EventQueue = List[Dict] -def concurrent_instances(): - return multiprocessing.cpu_count() - - -def clamav_version_parts(ver): - """Both clamscan and clamd return a version string that looks like the - following:: - - ClamAV 0.99.2/23992/Fri Oct 27 05:04:12 2017 - - Given the example above, this function returns a tuple as follows:: - - ("ClamAV 0.99.2", "23992/Fri Oct 27 05:04:12 2017") - - Both elements may be None if the matching failed. - """ - parts = ver.split("/") - n = len(parts) - if n == 1: - version = parts[0] - if re.match("^ClamAV", version): - return version, None - elif n == 3: - version, defs, date = parts - return version, f"{defs}/{date}" - return None, None - - -class ScannerBase(metaclass=abc.ABCMeta): - @abc.abstractmethod - def scan(self, path): - """Scan a file and return a tuple of three elements reporting the - results. These are the three elements expected: - 1. passed (bool) - 2. state (str - "OK", "ERROR", or "FOUND") - 3. details (str - extra info when ERROR or FOUND) - """ - - @abc.abstractproperty - def version_attrs(self): - """Obtain the version details. It is expected to return a tuple of two - elements: ClamAV version number and virus definition version number. - The implementor can cache the results. - """ - - def program(self): - return self.PROGRAM - - def version(self): - return self.version_attrs()[0] - - def virus_definitions(self): - return self.version_attrs()[1] - - -class ClamdScanner(ScannerBase): - PROGRAM = "ClamAV (clamd)" - - def __init__(self): - self.addr = django_settings.CLAMAV_SERVER - self.timeout = django_settings.CLAMAV_CLIENT_TIMEOUT - self.stream = django_settings.CLAMAV_PASS_BY_STREAM - self.client = self.get_client() - - def scan(self, path): - if self.stream: - method_name = "pass_by_stream" - result_key = "stream" - else: - method_name = "pass_by_reference" - result_key = path - - passed, state, details = (False, None, None) - try: - result = getattr(self, method_name)(path) - state, details = result[result_key] - except Exception as err: - passed = ClamdScanner.clamd_exception_handler(err) - if state == "OK": - passed = True - return passed, state, details - - @staticmethod - def clamd_exception_handler(err): - """Manage each decision for an exception when it is raised. Ensure - that each decision can be tested to meet the documented Archivematica - antivirus feature definition. - """ - if isinstance(err, IOError): - if err.errno == errno.EPIPE: - logger.error( - "[Errno 32] Broken pipe. File not scanned. Check Clamd " - "StreamMaxLength" - ) - return None - elif isinstance(err, BufferTooLongError): - logger.error( - "Clamd BufferTooLongError. File not scanned. Check Clamd " - "StreamMaxLength" - ) - return None - elif isinstance(err, ConnectionError): - logger.error( - "Clamd ConnectionError. File not scanned. Check Clamd " "output: %s", - err, - ) - return None - # Return False and provide some information to the user for all other - # failures. - logger.error("Virus scanning failed: %s", err, exc_info=True) - return False - def version_attrs(self): - try: - self._version_attrs - except AttributeError: - self._version_attrs = clamav_version_parts(self.client.version()) - return self._version_attrs - - def get_client(self): - if ":" not in self.addr: - return ClamdUnixSocket(path=self.addr) - host, port = self.addr.split(":") - return ClamdNetworkSocket(host=host, port=int(port), timeout=self.timeout) - - def pass_by_reference(self, path): - logger.info( - "File being being read by Clamdscan from filesystem \ - reference." - ) - return self.client.scan(path) - - def pass_by_stream(self, path): - logger.info("File contents being streamed to Clamdscan.") - return self.client.instream(open(path, "rb")) +@dataclasses.dataclass +class Args: + file_uuid: str + path: str + date: str + task_uuid: str -class ClamScanner(ScannerBase): - PROGRAM = "ClamAV (clamscan)" - COMMAND = "clamscan" - - def _call(self, *args): - return subprocess.check_output((self.COMMAND,) + args) - - def scan(self, path): - passed, state, details = (False, "ERROR", None) - try: - max_file_size = ( - "--max-filesize=%dM" % django_settings.CLAMAV_CLIENT_MAX_FILE_SIZE - ) - max_scan_size = ( - "--max-scansize=%dM" % django_settings.CLAMAV_CLIENT_MAX_SCAN_SIZE - ) - self._call(max_file_size, max_scan_size, path) - except subprocess.CalledProcessError as err: - if err.returncode == 1: - state = "FOUND" - else: - logger.error("Virus scanning failed: %s", err.output, exc_info=True) - else: - passed, state = (True, "OK") - return passed, state, details - - def version_attrs(self): - try: - self._version_attrs - except AttributeError: - try: - self._version_attrs = clamav_version_parts(self._call("-V")) - except subprocess.CalledProcessError: - self._version_attrs = (None, None) - return self._version_attrs +def concurrent_instances() -> int: + return multiprocessing.cpu_count() -def file_already_scanned(file_uuid): +def file_already_scanned(file_uuid: str) -> bool: return ( file_uuid != "None" and Event.objects.filter( @@ -226,17 +66,21 @@ def file_already_scanned(file_uuid): ) -def queue_event(file_uuid, date, scanner, passed, queue): +def queue_event( + file_uuid: str, + date: str, + scanner: Scanner, + passed: Optional[bool], + queue: EventQueue, +) -> None: if passed is None or file_uuid == "None": return - event_detail = "" if scanner is not None: - event_detail = f'program="{scanner.program()}"; version="{scanner.version()}"; virusDefinitions="{scanner.virus_definitions()}"' - + info = scanner.info() + event_detail = f'program="{info.name}"; version="{info.version}"; virusDefinitions="{info.virus_definitions}"' outcome = "Pass" if passed else "Fail" logger.info("Recording new event for file %s (outcome: %s)", file_uuid, outcome) - queue.append( { "fileUUID": file_uuid, @@ -249,41 +93,7 @@ def queue_event(file_uuid, date, scanner, passed, queue): ) -def get_parser(): - """Return a ``Namespace`` with the parsed arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument("file_uuid", metavar="fileUUID") - parser.add_argument("path", metavar="PATH", help="File or directory location") - parser.add_argument("date", metavar="DATE") - parser.add_argument( - "task_uuid", metavar="taskUUID", help="Currently unused, feel free to ignore." - ) - return parser - - -SCANNERS = (ClamScanner, ClamdScanner) -SCANNERS_NAMES = tuple(b.__name__.lower() for b in SCANNERS) -DEFAULT_SCANNER = ClamdScanner - - -def get_scanner(): - """Return the ClamAV client configured by the user and found in the - installation's environment variables. Clamdscanner may perform quicker - than Clamscanner given a larger number of objects. Return clamdscanner - object as a default if no other, or an incorrect value is specified. - """ - choice = str(django_settings.CLAMAV_CLIENT_BACKEND).lower() - if choice not in SCANNERS_NAMES: - logger.warning( - "Unexpected antivirus scanner (CLAMAV_CLIENT_BACKEND):" ' "%s"; using %s.', - choice, - DEFAULT_SCANNER.__name__, - ) - return DEFAULT_SCANNER() - return SCANNERS[SCANNERS_NAMES.index(choice)]() - - -def get_size(file_uuid, path): +def get_size(file_uuid: str, path: str) -> Optional[int]: # We're going to see this happening when files are not part of `objects/`. if file_uuid != "None": try: @@ -297,78 +107,128 @@ def get_size(file_uuid, path): return None -def scan_file(event_queue, file_uuid, path, date, task_uuid): - if file_already_scanned(file_uuid): - logger.info("Virus scan already performed, not running scan again") - return 0 +def validate_max_settings( + size: int, max_file_size: float, max_scan_size: float +) -> bool: + max_file_size = max_file_size * 1024 * 1024 + max_scan_size = max_scan_size * 1024 * 1024 + if size > max_file_size: + logger.info( + "File will not be scanned. Size %s bytes greater than scanner " + "max file size %s bytes", + size, + max_file_size, + ) + return False + elif size > max_scan_size: + logger.info( + "File will not be scanned. Size %s bytes greater than scanner " + "max scan size %s bytes", + size, + max_scan_size, + ) + return False + return True - scanner, passed = None, False +def scan_file( + scanner: Scanner, + event_queue: EventQueue, + opts: Args, +) -> int: + if file_already_scanned(opts.file_uuid): + logger.info("Virus scan already performed, not running scan again") + return 0 + passed: Optional[bool] = False try: - size = get_size(file_uuid, path) + size = get_size(opts.file_uuid, opts.path) if size is None: logger.error("Getting file size returned: %s", size) return 1 - - max_file_size = django_settings.CLAMAV_CLIENT_MAX_FILE_SIZE * 1024 * 1024 - max_scan_size = django_settings.CLAMAV_CLIENT_MAX_SCAN_SIZE * 1024 * 1024 - - valid_scan = True - - if size > max_file_size: - logger.info( - "File will not be scanned. Size %s bytes greater than scanner " - "max file size %s bytes", - size, - max_file_size, - ) - valid_scan = False - elif size > max_scan_size: - logger.info( - "File will not be scanned. Size %s bytes greater than scanner " - "max scan size %s bytes", - size, - max_scan_size, - ) - valid_scan = False - + valid_scan = validate_max_settings( + size, + settings.CLAMAV_CLIENT_MAX_FILE_SIZE, + settings.CLAMAV_CLIENT_MAX_SCAN_SIZE, + ) if valid_scan: scanner = get_scanner() + info = scanner.info() logger.info( "Using scanner %s (%s - %s)", - scanner.program(), - scanner.version(), - scanner.virus_definitions(), + info.name, + info.version, + info.virus_definitions, ) - - passed, state, details = scanner.scan(path) + result = scanner.scan(opts.path) + passed = result.state is not None + state = result.state + details = result.details else: passed, state, details = None, None, None - except Exception: - logger.error("Unexpected error scanning file %s", path, exc_info=True) + logger.error("Unexpected error scanning file %s", opts.path, exc_info=True) return 1 else: # record pass or fail, but not None if the file hasn't # been scanned, e.g. Max File Size thresholds being too low. if passed is not None: - logger.info("File %s scanned!", path) + logger.info("File %s scanned!", opts.path) logger.debug("passed=%s state=%s details=%s", passed, state, details) finally: - queue_event(file_uuid, date, scanner, passed, event_queue) + queue_event(opts.file_uuid, opts.date, scanner, passed, event_queue) # If True or None, then we have no error, the file can move through the # process as expected... return 1 if passed is False else 0 -def call(jobs): - event_queue = [] +def build_scanner(settings) -> Scanner: + backend = settings.CLAMAV_CLIENT_BACKEND.lower() + if backend == "clamdscanner": + backend = "clamd" + elif backend == "clamscanner": + backend = "clamscan" + config: Any = { + "backend": backend, + "address": settings.CLAMAV_SERVER, + "timeout": settings.CLAMAV_CLIENT_TIMEOUT, + "stream": settings.CLAMAV_PASS_BY_STREAM, + "max_file_size": settings.CLAMAV_CLIENT_MAX_FILE_SIZE, + "max_scan_size": settings.CLAMAV_CLIENT_MAX_SCAN_SIZE, + } + return get_scanner(config) + + +def get_parser() -> argparse.ArgumentParser: + """Return a ``Namespace`` with the parsed arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument("file_uuid", metavar="fileUUID") + parser.add_argument("path", metavar="PATH", help="File or directory location") + parser.add_argument("date", metavar="DATE") + parser.add_argument( + "task_uuid", metavar="taskUUID", help="Currently unused, feel free to ignore." + ) + return parser + + +def parse_args(parser: argparse.ArgumentParser, job: Job) -> Args: + namespace = parser.parse_args(job.args[1:]) + return Args(**vars(namespace)) + + +def main(jobs: List[Job]) -> None: + parser = get_parser() + event_queue: EventQueue = [] + scanner = build_scanner(settings) for job in jobs: with job.JobContext(logger=logger): - job.set_status(scan_file(event_queue, *job.args[1:])) - + opts = parse_args(parser, job) + job.set_status(scan_file(scanner, event_queue, opts)) with transaction.atomic(): for e in event_queue: insertIntoEvents(**e) + + +def call(jobs) -> None: + main(jobs)