From 197c2b3285c8e773004e5678474b1a9e2ee4adf2 Mon Sep 17 00:00:00 2001 From: Florian Valeye Date: Mon, 8 Nov 2021 18:27:40 +0100 Subject: [PATCH] Feature/add directory scan for ContentFileScanner. (#4) * Add async method for external column names scanner (#3) * Add scan directory for the ContentFilesScanner --- python/Cargo.toml | 2 +- python/docs/source/usage.rst | 6 +++--- python/metadata_guardian/scanner.py | 23 +++++++++++++++++++++-- python/tests/test_data_rules.py | 4 ++-- python/tests/test_scanner.py | 16 +++++++++++++++- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/python/Cargo.toml b/python/Cargo.toml index c084d55..496aaa1 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "metadata_guardian-python" -version = "0.0.8" +version = "0.0.9" authors = ["Florian Valeye "] homepage = "https://fvaleye.github.io/metadata-guardian/python" license = "Apache-2.0" diff --git a/python/docs/source/usage.rst b/python/docs/source/usage.rst index fe6102e..34def7c 100644 --- a/python/docs/source/usage.rst +++ b/python/docs/source/usage.rst @@ -64,10 +64,10 @@ Scan the column names of a local source: Scan content of a file: ->>> from metadata_guardian import DataRules, ContentFileScanner, AvailableCategory +>>> from metadata_guardian import DataRules, ContentFilesScanner, AvailableCategory >>> >>> data_rules = DataRules.from_available_category(category=AvailableCategory.PII) ->>> column_scanner = ContentFileScanner(data_rules=data_rules) ->>> report = column_scanner.scan_local_file(path="path") +>>> content_file_scanner = ContentFilesScanner(data_rules=data_rules) +>>> report = content_file_scanner.scan_local_file(path="path") >>> report.to_console() diff --git a/python/metadata_guardian/scanner.py b/python/metadata_guardian/scanner.py index a3b961d..7ca7827 100644 --- a/python/metadata_guardian/scanner.py +++ b/python/metadata_guardian/scanner.py @@ -1,4 +1,5 @@ import asyncio +import os from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Optional @@ -203,7 +204,7 @@ async def async_validate_words(table_name: str) -> ReportResults: @dataclass -class ContentFileScanner: +class ContentFilesScanner: """Content Files Scanner instance.""" data_rules: DataRules @@ -212,7 +213,7 @@ def scan_local_file(self, path: str) -> MetadataGuardianReport: """ Scan a file with data rules. :param path: the path of the file to scan - :return: Metadata Guardian report + :return: a Metadata Guardian report """ return MetadataGuardianReport( report_results=[ @@ -221,3 +222,21 @@ def scan_local_file(self, path: str) -> MetadataGuardianReport: ) ] ) + + def scan_directory( + self, directory_path: str, file_names_extension: str + ) -> MetadataGuardianReport: + """ + Scan all the files inside directory path with the file name extension. + :param directory_path: the directory path to scan + :param file_names_extension: the file name extension to include (without the .) + :return: a Metadata Guardian report + """ + report = MetadataGuardianReport() + for root, dirs, files in os.walk(directory_path): + for name in files: + if name.endswith(f".{file_names_extension}"): + report.append( + other_report=self.scan_local_file(path=f"{root}/{name}") + ) + return report diff --git a/python/tests/test_data_rules.py b/python/tests/test_data_rules.py index 943ee54..184015f 100644 --- a/python/tests/test_data_rules.py +++ b/python/tests/test_data_rules.py @@ -1,7 +1,7 @@ import pytest from metadata_guardian.data_rules import AvailableCategory, DataRules -from metadata_guardian.scanner import ColumnScanner, ContentFileScanner +from metadata_guardian.scanner import ColumnScanner, ContentFilesScanner from metadata_guardian.source.local.avro_schema_source import AvroSchemaSource @@ -49,7 +49,7 @@ def test_get_data_rules_from_category_inclusion_no_violation(local_file): def test_get_data_rules_from_category_inclusion_violation_content(local_file): data_rules = DataRules.from_available_category(category=AvailableCategory.INCLUSION) - md_results = ContentFileScanner(data_rules=data_rules).scan_local_file(local_file) + md_results = ContentFilesScanner(data_rules=data_rules).scan_local_file(local_file) assert len(md_results.report_results[0].results) == 1 assert "resources/inclusion_violation.txt" in md_results.report_results[0].source diff --git a/python/tests/test_scanner.py b/python/tests/test_scanner.py index 712626a..47fcd1f 100644 --- a/python/tests/test_scanner.py +++ b/python/tests/test_scanner.py @@ -1,9 +1,10 @@ import asyncio +import os from unittest.mock import patch from metadata_guardian.data_rules import AvailableCategory, DataRules from metadata_guardian.report import MetadataGuardianReport, ReportResults -from metadata_guardian.scanner import ColumnScanner +from metadata_guardian.scanner import ColumnScanner, ContentFilesScanner from metadata_guardian.source.external.snowflake_source import SnowflakeSource @@ -125,3 +126,16 @@ def test_column_scanner_database_name_async(mock_connection): ) assert report == expected + + +def test_local_directory_scan(): + directory_path = os.path.join(os.path.dirname(__file__), "resources") + file_names_extension = "txt" + + data_rules = DataRules.from_available_category(category=AvailableCategory.INCLUSION) + + report = ContentFilesScanner(data_rules=data_rules).scan_directory( + directory_path=directory_path, file_names_extension=file_names_extension + ) + + assert "resources/inclusion_violation.txt" in str(report)