From 4e878cbb3db62d65f62d3ae6d7af9586859f5fd0 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 12 Oct 2022 16:30:58 +0200 Subject: [PATCH 01/50] Implement a download utility Signed-off-by: Bram Stoeller --- pyproject.toml | 1 + src/power_grid_model_io/utils/download.py | 132 ++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 src/power_grid_model_io/utils/download.py diff --git a/pyproject.toml b/pyproject.toml index f6e04ad7..d348bf3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "power_grid_model>=1.4", "pyyaml", "structlog", + "tqdm", ] dynamic = ["version"] diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py new file mode 100644 index 00000000..29873e66 --- /dev/null +++ b/src/power_grid_model_io/utils/download.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +""" +Helper functions to download (and store) files from the internet +""" + +import hashlib +from pathlib import Path +from tempfile import gettempdir +from typing import ByteString, Callable, Optional +from urllib import request + +import structlog +import tqdm + + +def download( + url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False +) -> Path: + """ + Download a file from a URL and store it locally + + Args: + url: The url to the file + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will + be used. Use Path(".") if you intend to use the current working directory. + overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? + + Returns: + The path to the downloaded file + """ + log = structlog.get_logger(__name__) + file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) + + if file_path.is_file(): + if overwrite: + log.debug("Forced re-downloading existing file", url=url, file_path=file_path) + else: + current_size = get_url_size(url=url) + previous_size = file_path.stat().st_size + if previous_size == current_size: + log.debug("Skip downloading existing file", url=url, file_path=file_path) + return file_path + log.debug( + "Re-downloading existing file, because the size has changed", + url=url, + file_path=file_path, + previous_size=previous_size, + current_size=current_size, + ) + else: + log.debug("Downloading file", url=url, file_path=file_path) + + def progress_hook(progress_bar: tqdm.tqdm) -> Callable[[int, int, int], None]: + last_block = [0] + + def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None: + if file_size > 0: + progress_bar.total = file_size + progress_bar.update((block_num - last_block[0]) * block_size) + last_block[0] = block_num + + return update_progress_bar + + # Download to a temp file first, so the results are not stored if the transfer fails + with tqdm.tqdm(unit="B", unit_scale=True, desc=url, leave=True) as progress_bar: + temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) + + # Check if the file contains any content + temp_path = Path(temp_file) + if temp_path.stat().st_size == 0: + log.warning("Downloaded an empty file", url=url, file_path=file_path) + + # Move the file to it's final destination + file_path.unlink(missing_ok=True) + temp_path.rename(file_path) + log.debug("Downloaded file", url=url, file_path=file_path, file_size=file_path.stat().st_size) + + return file_path + + +def get_url_size(url: str) -> int: + """ + Retrieve the file size of a given URL (based on it's header) + + Args: + url: The url to the file + + Return: + The file size in bytes + """ + return int(request.urlopen(url).headers["Content-Length"]) + + +def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: + """ + Determine the file path based on dir_path, file_path and/or data + + Args: + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will + be used. Use Path(".") if you intend to use the current working directory. + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url + data: A bytestring that can be used to generate a filename. + """ + # If no file_path is given, generate a file name + if file_path is None: + if data is None: + raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") + try: + md5 = hashlib.md5() + md5.update(data) + hash_str = md5.hexdigest() + except (TypeError, ValueError) as ex: + raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex + file_path = Path(f"{__name__}.{hash_str}") + + # If no dir_path is given, use the system's temp dir + if dir_path is None: + dir_path = Path(gettempdir()) + + # Prefix the file_path and allow special path characters like ~ + file_path = dir_path.expanduser() / file_path + + # If the file_path exists, it should be a file (not a dir) + if file_path.exists() and not file_path.is_file(): + raise ValueError(f"Invalid file path: {file_path}") + + return file_path.absolute() From 474d08af9d2cc3ccca1a7579bbe7a8413f51257a Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 11:03:10 +0200 Subject: [PATCH 02/50] Add download_and_extract() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 135 +++++++++++++++++----- 1 file changed, 106 insertions(+), 29 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 29873e66..2278ea3a 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -6,15 +6,41 @@ """ import hashlib +import re +import shutil +import zipfile from pathlib import Path -from tempfile import gettempdir -from typing import ByteString, Callable, Optional +from typing import ByteString, Callable, Optional, Tuple from urllib import request import structlog import tqdm +def download_and_extract( + url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False +) -> Path: + """ + Download a file from a URL and store it locally, extract the contents and return the path to the contents. + + Args: + url: The url to the .zip file + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir + will be used. + overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? + + Returns: + The path to the downloaded file + """ + src_file_path = download(url=url, file_path=file_path, dir_path=dir_path, overwrite=overwrite) + dst_dir_path = src_file_path.with_suffix("") + if overwrite and dst_dir_path.is_dir(): + shutil.rmtree(dst_dir_path) + return extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=not overwrite) + + def download( url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False ) -> Path: @@ -25,34 +51,35 @@ def download( url: The url to the file file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is generated based on the url - dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will - be used. Use Path(".") if you intend to use the current working directory. + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir + will be used. overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? Returns: The path to the downloaded file """ - log = structlog.get_logger(__name__) + remote_size, remote_file_name = get_url_size_and_file_name(url=url) + if file_path is None and remote_file_name: + file_path = Path(remote_file_name) + file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) + log = structlog.get_logger(__name__).bind(url=url, file_path=file_path) if file_path.is_file(): if overwrite: - log.debug("Forced re-downloading existing file", url=url, file_path=file_path) + log.debug("Forced re-downloading existing file") else: - current_size = get_url_size(url=url) - previous_size = file_path.stat().st_size - if previous_size == current_size: - log.debug("Skip downloading existing file", url=url, file_path=file_path) + local_size = file_path.stat().st_size + if local_size == remote_size: + log.debug("Skip downloading existing file") return file_path log.debug( "Re-downloading existing file, because the size has changed", - url=url, - file_path=file_path, - previous_size=previous_size, - current_size=current_size, + local_size=local_size, + remote_size=remote_size, ) else: - log.debug("Downloading file", url=url, file_path=file_path) + log.debug("Downloading file") def progress_hook(progress_bar: tqdm.tqdm) -> Callable[[int, int, int], None]: last_block = [0] @@ -66,23 +93,24 @@ def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None return update_progress_bar # Download to a temp file first, so the results are not stored if the transfer fails - with tqdm.tqdm(unit="B", unit_scale=True, desc=url, leave=True) as progress_bar: + with tqdm.tqdm(desc=url, unit="B", unit_scale=True, leave=True) as progress_bar: temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) # Check if the file contains any content temp_path = Path(temp_file) if temp_path.stat().st_size == 0: - log.warning("Downloaded an empty file", url=url, file_path=file_path) + log.warning("Downloaded an empty file") # Move the file to it's final destination + file_path.parent.mkdir(parents=True, exist_ok=True) file_path.unlink(missing_ok=True) temp_path.rename(file_path) - log.debug("Downloaded file", url=url, file_path=file_path, file_size=file_path.stat().st_size) + log.debug("Downloaded file", file_size=file_path.stat().st_size) return file_path -def get_url_size(url: str) -> int: +def get_url_size_and_file_name(url: str) -> Tuple[int, str]: """ Retrieve the file size of a given URL (based on it's header) @@ -92,7 +120,13 @@ def get_url_size(url: str) -> int: Return: The file size in bytes """ - return int(request.urlopen(url).headers["Content-Length"]) + with request.urlopen(url) as context: + headers = context.headers + file_size = int(headers.get("Content-Length", 0)) + matches = re.findall(r"filename=\"(.+)\"", headers.get("Content-Disposition", "")) + file_name = matches[0] if matches else None + + return file_size, file_name def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: @@ -106,27 +140,70 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= generated based on the url data: A bytestring that can be used to generate a filename. """ + # If no file_path is given, generate a file name if file_path is None: - if data is None: - raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") try: md5 = hashlib.md5() md5.update(data) hash_str = md5.hexdigest() except (TypeError, ValueError) as ex: raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex - file_path = Path(f"{__name__}.{hash_str}") + file_path = Path(__name__) / f"{hash_str}.download" - # If no dir_path is given, use the system's temp dir - if dir_path is None: - dir_path = Path(gettempdir()) + # If no dir_path is given, use current working dir + elif dir_path is None: + dir_path = Path(".") - # Prefix the file_path and allow special path characters like ~ - file_path = dir_path.expanduser() / file_path + # Combine the two paths + file_path = (dir_path / file_path).resolve() if dir_path else file_path.resolve() # If the file_path exists, it should be a file (not a dir) if file_path.exists() and not file_path.is_file(): raise ValueError(f"Invalid file path: {file_path}") - return file_path.absolute() + return file_path + + +def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=True) -> Path: + """ + Extract a .zip file and return the destination dir + + Args: + src_file_path: The .zip file to extract. + src_file_path: An optional destination path. If none is given, the src_file_path wihout .zip extension is used. + skip_if_exists: If true, it returns the dir path, otherwise raise an exception. + + Returns: The path where the files are extracted + + """ + if src_file_path.suffix.lower() != ".zip": + raise ValueError(f"Only files with .zip extension are supported, got {src_file_path}") + if dst_dir_path is None: + dst_dir_path = src_file_path.with_suffix("") + + log = structlog.get_logger(__name__).bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + if dst_dir_path.exists(): + if not dst_dir_path.is_dir(): + raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") + if not skip_if_exists: + raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") + log.debug("Skip extraction, destiation dir exists") + return dst_dir_path + + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm.tqdm(desc=str(src_file_path), iterable=file_list, total=len(file_list), unit="file"): + zip_file.extract(member=file_path, path=dst_dir_path) + + # If the zip files contains a single directory with the same name as the zip file, return that dir + contents = list(dst_dir_path.iterdir()) + if len(contents) == 1 and contents[0].is_dir() and contents[0].name == src_file_path.stem: + dst_dir_path = contents[0] + + return dst_dir_path From 1ed9f80fd35c8e729ebbdb634a18d2da49a36c0a Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 19:40:51 +0200 Subject: [PATCH 03/50] Fix path bug for existing dirs Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 2278ea3a..b637c0d9 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -14,7 +14,7 @@ from urllib import request import structlog -import tqdm +from tqdm import tqdm def download_and_extract( @@ -81,7 +81,7 @@ def download( else: log.debug("Downloading file") - def progress_hook(progress_bar: tqdm.tqdm) -> Callable[[int, int, int], None]: + def progress_hook(progress_bar: tqdm) -> Callable[[int, int, int], None]: last_block = [0] def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None: @@ -93,7 +93,7 @@ def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None return update_progress_bar # Download to a temp file first, so the results are not stored if the transfer fails - with tqdm.tqdm(desc=url, unit="B", unit_scale=True, leave=True) as progress_bar: + with tqdm(desc="Downloading", unit="B", unit_scale=True, leave=True) as progress_bar: temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) # Check if the file contains any content @@ -190,16 +190,16 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if not skip_if_exists: raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") log.debug("Skip extraction, destiation dir exists") - return dst_dir_path - # Create the destination directory - dst_dir_path.mkdir(parents=True, exist_ok=True) - - # Extract per file, so we can show a progress bar - with zipfile.ZipFile(src_file_path, "r") as zip_file: - file_list = zip_file.namelist() - for file_path in tqdm.tqdm(desc=str(src_file_path), iterable=file_list, total=len(file_list), unit="file"): - zip_file.extract(member=file_path, path=dst_dir_path) + else: + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + zip_file.extract(member=file_path, path=dst_dir_path) # If the zip files contains a single directory with the same name as the zip file, return that dir contents = list(dst_dir_path.iterdir()) From d2db4c77c49026da032b127a59f7e53c2831a0e9 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 20:04:46 +0200 Subject: [PATCH 04/50] Switch dir_path and file_path in download_and_extract Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index b637c0d9..cb115740 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -18,17 +18,17 @@ def download_and_extract( - url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False + url: str, dir_path: Optional[Path] = None, file_path: Optional[Path] = None, overwrite: bool = False ) -> Path: """ Download a file from a URL and store it locally, extract the contents and return the path to the contents. Args: url: The url to the .zip file - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is - generated based on the url dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir will be used. + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? Returns: From 029f4cd68d7067b4d95efaf710b0e21ce5d8a9b5 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 22:05:26 +0200 Subject: [PATCH 05/50] Fix typo Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index cb115740..fa9550d7 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -189,7 +189,7 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") if not skip_if_exists: raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") - log.debug("Skip extraction, destiation dir exists") + log.debug("Skip extraction, destination dir exists") else: # Create the destination directory From d2793e2eb036fd7b87bca677847e75dd58322ee3 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 22:29:40 +0200 Subject: [PATCH 06/50] Get and check http status code Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index fa9550d7..e36ae70c 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -58,7 +58,10 @@ def download( Returns: The path to the downloaded file """ - remote_size, remote_file_name = get_url_size_and_file_name(url=url) + status_code, remote_size, remote_file_name = get_url_headers(url=url) + if status_code != 200: + raise IOError(f"Could not download from URL, status={status_code}") + if file_path is None and remote_file_name: file_path = Path(remote_file_name) @@ -110,7 +113,7 @@ def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None return file_path -def get_url_size_and_file_name(url: str) -> Tuple[int, str]: +def get_url_headers(url: str) -> Tuple[int, int, str]: """ Retrieve the file size of a given URL (based on it's header) @@ -121,12 +124,13 @@ def get_url_size_and_file_name(url: str) -> Tuple[int, str]: The file size in bytes """ with request.urlopen(url) as context: + status_code = context.status headers = context.headers file_size = int(headers.get("Content-Length", 0)) matches = re.findall(r"filename=\"(.+)\"", headers.get("Content-Disposition", "")) file_name = matches[0] if matches else None - return file_size, file_name + return status_code, file_size, file_name def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: From 157a38937b2556c8fc081f24cb8061c83e57f459 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 26 Oct 2022 11:54:01 +0200 Subject: [PATCH 07/50] Use sha1 hashes instead of md5 Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index e36ae70c..1381e64c 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -148,9 +148,9 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= # If no file_path is given, generate a file name if file_path is None: try: - md5 = hashlib.md5() - md5.update(data) - hash_str = md5.hexdigest() + sha1 = hashlib.sha1() + sha1.update(data) + hash_str = sha1.hexdigest() except (TypeError, ValueError) as ex: raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex file_path = Path(__name__) / f"{hash_str}.download" From e13ecf235f4150e55dacb8f0324888e8b68dfee8 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 26 Oct 2022 11:57:58 +0200 Subject: [PATCH 08/50] Use sha256 hashes instead of sha1 Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 1381e64c..99ce500f 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -148,9 +148,9 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= # If no file_path is given, generate a file name if file_path is None: try: - sha1 = hashlib.sha1() - sha1.update(data) - hash_str = sha1.hexdigest() + sha256 = hashlib.sha256() + sha256.update(data) + hash_str = sha256.hexdigest() except (TypeError, ValueError) as ex: raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex file_path = Path(__name__) / f"{hash_str}.download" From df2cc0744ce6e835b717e2206be855e3d9174f19 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 13:55:43 +0100 Subject: [PATCH 09/50] Create DownloadProgressHook class + unit tests Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 51 ++++++++++++++------ tests/unit/utils/test_download.py | 57 +++++++++++++++++++++++ 2 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 tests/unit/utils/test_download.py diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 99ce500f..ca051221 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -10,13 +10,42 @@ import shutil import zipfile from pathlib import Path -from typing import ByteString, Callable, Optional, Tuple +from typing import ByteString, Optional, Tuple from urllib import request import structlog from tqdm import tqdm +class DownloadProgressHook: # pylint: disable=too-few-public-methods + """ + Report hook for request.urlretrieve() to update a progress bar based on the amount of downloaded blocks + """ + + def __init__(self, progress_bar: tqdm): + """ + Report hook for request.urlretrieve() to update a progress bar based on the amount of downloaded blocks + + Args: + progress_bar: A tqdb progress bar + """ + self._progress_bar = progress_bar + self._last_block = 0 + + def __call__(self, block_num: int, block_size: int, file_size: int) -> None: + """ + Args: + block_num: The last downloaded block number + block_size: The block size in bytes + file_size: The file size in bytes (may be 0 in the first call) + + """ + if file_size > 0: + self._progress_bar.total = file_size + self._progress_bar.update((block_num - self._last_block) * block_size) + self._last_block = block_num + + def download_and_extract( url: str, dir_path: Optional[Path] = None, file_path: Optional[Path] = None, overwrite: bool = False ) -> Path: @@ -30,14 +59,21 @@ def download_and_extract( file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is generated based on the url overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? + Be careful with this option, as it will remove files from your drive irreversibly! Returns: The path to the downloaded file """ + + # Download the file and use the file name as the base name for the extraction directory src_file_path = download(url=url, file_path=file_path, dir_path=dir_path, overwrite=overwrite) dst_dir_path = src_file_path.with_suffix("") + + # If we explicitly want to overwrite the extracted files, remove the if overwrite and dst_dir_path.is_dir(): shutil.rmtree(dst_dir_path) + + # Extract the files and return the path of the extraction directory return extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=not overwrite) @@ -84,20 +120,9 @@ def download( else: log.debug("Downloading file") - def progress_hook(progress_bar: tqdm) -> Callable[[int, int, int], None]: - last_block = [0] - - def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None: - if file_size > 0: - progress_bar.total = file_size - progress_bar.update((block_num - last_block[0]) * block_size) - last_block[0] = block_num - - return update_progress_bar - # Download to a temp file first, so the results are not stored if the transfer fails with tqdm(desc="Downloading", unit="B", unit_scale=True, leave=True) as progress_bar: - temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) + temp_file, _headers = request.urlretrieve(url, reporthook=DownloadProgressHook(progress_bar)) # Check if the file contains any content temp_path = Path(temp_file) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py new file mode 100644 index 00000000..a35bd707 --- /dev/null +++ b/tests/unit/utils/test_download.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +from unittest.mock import MagicMock + +from power_grid_model_io.utils.download import ( + DownloadProgressHook, + download, + download_and_extract, + extract, + get_download_path, + get_url_headers, +) + + +def test_progress_hook(): + # Arrange + progress_bar = MagicMock() + progress_bar.total = None + hook = DownloadProgressHook(progress_bar) + + # Act + hook(2, 10, 0) + + # Assert + assert progress_bar.total is None # total is not updated + progress_bar.update.assert_called_once_with((2 - 0) * 10) + + # Arrange + progress_bar.reset_mock() + + # Act + hook(5, 10, 123) + + # Assert + assert progress_bar.total == 123 # total is updated + progress_bar.update.assert_called_once_with((5 - 2) * 10) + + +def test_download_and_extract(): + download_and_extract + + +def test_download(): + download + + +def test_get_url_headers(): + get_url_headers + + +def test_get_download_path(): + get_download_path + + +def test_extract(): + extract From 8463f08ee2a44fead6bb9ec822d8c573c68764ed Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 14:41:48 +0100 Subject: [PATCH 10/50] Unit tests for download_and_extract() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 4 +- tests/unit/utils/test_download.py | 90 ++++++++++++++++++++--- 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index ca051221..80a0d33a 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -7,9 +7,9 @@ import hashlib import re -import shutil import zipfile from pathlib import Path +from shutil import rmtree as remove_dir from typing import ByteString, Optional, Tuple from urllib import request @@ -71,7 +71,7 @@ def download_and_extract( # If we explicitly want to overwrite the extracted files, remove the if overwrite and dst_dir_path.is_dir(): - shutil.rmtree(dst_dir_path) + remove_dir(dst_dir_path) # Extract the files and return the path of the extraction directory return extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=not overwrite) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index a35bd707..f6b430a1 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,7 +1,8 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 -from unittest.mock import MagicMock +from pathlib import Path +from unittest.mock import MagicMock, patch from power_grid_model_io.utils.download import ( DownloadProgressHook, @@ -19,26 +20,95 @@ def test_progress_hook(): progress_bar.total = None hook = DownloadProgressHook(progress_bar) - # Act + # Act (block_num, block_size, file_size) hook(2, 10, 0) + assert progress_bar.total is None # total is not updated + + hook(3, 10, 123) + assert progress_bar.total == 123 # total is updated + + hook(6, 10, 123) # Assert - assert progress_bar.total is None # total is not updated - progress_bar.update.assert_called_once_with((2 - 0) * 10) + assert progress_bar.update.call_args_list[0].args == (20,) + assert progress_bar.update.call_args_list[1].args == (10,) + assert progress_bar.update.call_args_list[2].args == (30,) + +@patch("power_grid_model_io.utils.download.extract") +@patch("power_grid_model_io.utils.download.download") +def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: MagicMock): # Arrange - progress_bar.reset_mock() + url = "https://www.source.com/remote.zip" + dir_path = Path("/tmp/dir-path/") + file_path = Path("file.zip") + src_file_path = Path("/tmp/dir-path/local.zip") + dst_dir_path = Path("/tmp/dir-path/local") + extract_dir_path = Path("/tmp/extract-path/local") # in practice this would be the same as dst_dir_path + + mock_download.return_value = src_file_path + mock_extract.return_value = extract_dir_path # Act - hook(5, 10, 123) + result = download_and_extract(url=url, dir_path=dir_path, file_path=file_path) # Assert - assert progress_bar.total == 123 # total is updated - progress_bar.update.assert_called_once_with((5 - 2) * 10) + mock_download.assert_called_once_with(url=url, file_path=file_path, dir_path=dir_path, overwrite=False) + mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert result == extract_dir_path -def test_download_and_extract(): - download_and_extract +@patch("power_grid_model_io.utils.download.extract") +@patch("power_grid_model_io.utils.download.download") +def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: MagicMock): + # Arrange + url = "https://www.source.com/remote.zip" + src_file_path = Path("current-working-dir/remote.zip") + dst_dir_path = Path("current-working-dir/remote") + extract_dir_path = Path("current-working-dir/extract") # in practice this would be the same as dst_dir_path + + mock_download.return_value = src_file_path + mock_extract.return_value = extract_dir_path + + # Act + result = download_and_extract(url=url) + + # Assert + mock_download.assert_called_once_with(url=url, file_path=None, dir_path=None, overwrite=False) + mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert result == extract_dir_path + + +@patch("power_grid_model_io.utils.download.extract", new=MagicMock) +@patch("power_grid_model_io.utils.download.remove_dir") +@patch("power_grid_model_io.utils.download.Path.is_dir") +@patch("power_grid_model_io.utils.download.download") +def test_download_and_extract__overwrite(mock_download: MagicMock, mock_is_dir: MagicMock, mock_remove_dir: MagicMock): + # Arrange + src_file_path = Path("current-working-dir/remote.zip") + dst_dir_path = Path("current-working-dir/remote") + + mock_download.return_value = src_file_path + + # Act / Assert (dir does not exist, overwrite = False) + mock_is_dir.return_value = False + download_and_extract(url=MagicMock(), overwrite=False) + mock_remove_dir.assert_not_called() + + # Act / Assert (dir does not exist, overwrite = True) + mock_is_dir.return_value = False + download_and_extract(url=MagicMock(), overwrite=True) + mock_remove_dir.assert_not_called() + + # Act / Assert (dir does exist, overwrite = False) + mock_is_dir.return_value = True + download_and_extract(url=MagicMock(), overwrite=False) + mock_remove_dir.assert_not_called() + + # Act / Assert (dir does exist, overwrite = True) + mock_is_dir.return_value = True + download_and_extract(url=MagicMock(), overwrite=True) + mock_remove_dir.assert_called_once_with(dst_dir_path) def test_download(): From da6f76c94a6ed32140616702324299842f4ae15f Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 16:49:04 +0100 Subject: [PATCH 11/50] Less mocking more actual file handling (in a temp dir) Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 16 +- tests/unit/utils/test_download.py | 186 +++++++++++++++++----- 2 files changed, 159 insertions(+), 43 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 80a0d33a..42ab30b3 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -16,6 +16,8 @@ import structlog from tqdm import tqdm +_log = structlog.get_logger(__name__) + class DownloadProgressHook: # pylint: disable=too-few-public-methods """ @@ -102,11 +104,12 @@ def download( file_path = Path(remote_file_name) file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) - log = structlog.get_logger(__name__).bind(url=url, file_path=file_path) + log = _log.bind(url=url, file_path=file_path) if file_path.is_file(): if overwrite: log.debug("Forced re-downloading existing file") + # Don't remove the existing file just yet... Let's first see if we can download a new version. else: local_size = file_path.stat().st_size if local_size == remote_size: @@ -122,23 +125,26 @@ def download( # Download to a temp file first, so the results are not stored if the transfer fails with tqdm(desc="Downloading", unit="B", unit_scale=True, leave=True) as progress_bar: - temp_file, _headers = request.urlretrieve(url, reporthook=DownloadProgressHook(progress_bar)) + report_hook = DownloadProgressHook(progress_bar) + temp_file, _headers = request.urlretrieve(url, reporthook=report_hook) # Check if the file contains any content temp_path = Path(temp_file) if temp_path.stat().st_size == 0: log.warning("Downloaded an empty file") + # Remove the file, if it already exists + file_path.unlink(missing_ok=True) + # Move the file to it's final destination file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.unlink(missing_ok=True) temp_path.rename(file_path) log.debug("Downloaded file", file_size=file_path.stat().st_size) return file_path -def get_url_headers(url: str) -> Tuple[int, int, str]: +def get_url_headers(url: str) -> Tuple[int, int, Optional[str]]: """ Retrieve the file size of a given URL (based on it's header) @@ -211,7 +217,7 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if dst_dir_path is None: dst_dir_path = src_file_path.with_suffix("") - log = structlog.get_logger(__name__).bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + log = _log.bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) if dst_dir_path.exists(): if not dst_dir_path.is_dir(): diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index f6b430a1..fd73ce3e 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,8 +1,12 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 +import tempfile from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch + +import pytest +import structlog.testing from power_grid_model_io.utils.download import ( DownloadProgressHook, @@ -13,6 +17,25 @@ get_url_headers, ) +from ...utils import assert_log_exists + +ZIP_SIZE = 1024 + + +@pytest.fixture() +def temp_dir(): + with tempfile.TemporaryDirectory() as tmp: + yield Path(tmp) + + +def make_file(file_name: Path, file_size: int = 0): + with open(file_name, "wb") as fp: + fp.write(b"\0" * file_size) + + +def make_dir(dirname: Path): + dirname.mkdir(parents=True) + def test_progress_hook(): # Arrange @@ -37,14 +60,14 @@ def test_progress_hook(): @patch("power_grid_model_io.utils.download.extract") @patch("power_grid_model_io.utils.download.download") -def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: MagicMock): +def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: MagicMock, temp_dir: Path): # Arrange - url = "https://www.source.com/remote.zip" - dir_path = Path("/tmp/dir-path/") - file_path = Path("file.zip") - src_file_path = Path("/tmp/dir-path/local.zip") - dst_dir_path = Path("/tmp/dir-path/local") - extract_dir_path = Path("/tmp/extract-path/local") # in practice this would be the same as dst_dir_path + url = MagicMock() + dir_path = MagicMock() + file_path = MagicMock() + src_file_path = temp_dir / "data.zip" + dst_dir_path = temp_dir / "data" + extract_dir_path = MagicMock() mock_download.return_value = src_file_path mock_extract.return_value = extract_dir_path @@ -60,59 +83,146 @@ def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: Mag @patch("power_grid_model_io.utils.download.extract") @patch("power_grid_model_io.utils.download.download") -def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: MagicMock): +def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: MagicMock, temp_dir: Path): # Arrange - url = "https://www.source.com/remote.zip" - src_file_path = Path("current-working-dir/remote.zip") - dst_dir_path = Path("current-working-dir/remote") - extract_dir_path = Path("current-working-dir/extract") # in practice this would be the same as dst_dir_path + url = MagicMock() + src_file_path = temp_dir / "data.zip" + dst_dir_path = temp_dir / "data" mock_download.return_value = src_file_path - mock_extract.return_value = extract_dir_path # Act - result = download_and_extract(url=url) + download_and_extract(url=url) # Assert mock_download.assert_called_once_with(url=url, file_path=None, dir_path=None, overwrite=False) mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) - assert result == extract_dir_path @patch("power_grid_model_io.utils.download.extract", new=MagicMock) -@patch("power_grid_model_io.utils.download.remove_dir") -@patch("power_grid_model_io.utils.download.Path.is_dir") @patch("power_grid_model_io.utils.download.download") -def test_download_and_extract__overwrite(mock_download: MagicMock, mock_is_dir: MagicMock, mock_remove_dir: MagicMock): +def test_download_and_extract__overwrite(mock_download: MagicMock, temp_dir: Path): # Arrange - src_file_path = Path("current-working-dir/remote.zip") - dst_dir_path = Path("current-working-dir/remote") - + src_file_path = temp_dir / "data.zip" mock_download.return_value = src_file_path - # Act / Assert (dir does not exist, overwrite = False) - mock_is_dir.return_value = False - download_and_extract(url=MagicMock(), overwrite=False) - mock_remove_dir.assert_not_called() + dst_dir_path = temp_dir / "data" + make_dir(dst_dir_path) - # Act / Assert (dir does not exist, overwrite = True) - mock_is_dir.return_value = False - download_and_extract(url=MagicMock(), overwrite=True) - mock_remove_dir.assert_not_called() - - # Act / Assert (dir does exist, overwrite = False) - mock_is_dir.return_value = True + # Act / Assert download_and_extract(url=MagicMock(), overwrite=False) - mock_remove_dir.assert_not_called() + assert dst_dir_path.is_dir() # Act / Assert (dir does exist, overwrite = True) - mock_is_dir.return_value = True download_and_extract(url=MagicMock(), overwrite=True) - mock_remove_dir.assert_called_once_with(dst_dir_path) + assert not dst_dir_path.exists() + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook") +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm") +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download( + mock_url_headers: MagicMock, + mock_download_path: MagicMock, + mock_tqdm: MagicMock, + mock_urlretrieve: MagicMock, + mock_hook: MagicMock, + temp_dir: Path, +): + # Arrange + url = "https://www.source.com" + dir_path = temp_dir / "data" + file_path = temp_dir / "data.zip" + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, ZIP_SIZE) + return temp_file, None + + mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + # Act / assert + with structlog.testing.capture_logs() as capture: + result = download(url=url, file_path=file_path, dir_path=dir_path) + assert_log_exists(capture, "debug", "Downloading file") + + # Assert + mock_download_path.assert_called_once_with(dir_path=dir_path, file_path=file_path, data=b"https://www.source.com") + mock_tqdm.assert_called_once() + mock_hook.assert_called_once_with(mock_tqdm.return_value.__enter__.return_value) + mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) + assert result == file_path + assert result.is_file() + assert result.stat().st_size == ZIP_SIZE + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__auto_file_name( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, 0) + return temp_file, None + + mock_url_headers.return_value = (200, 0, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act + download(url=MagicMock()) + + # Assert + mock_download_path.assert_called_once_with(dir_path=None, file_path=Path("remote.zip"), data=ANY) + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__empty_file( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, 0) + return temp_file, None + + mock_url_headers.return_value = (200, 0, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act / Assert + with structlog.testing.capture_logs() as capture: + download(url=MagicMock()) + assert_log_exists(capture, "debug", "Downloading file") + assert_log_exists(capture, "warning", "Downloaded an empty file") + + +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__status_error(mock_url_headers: MagicMock): + # Arrange + mock_url_headers.return_value = (403, 0, None) -def test_download(): - download + # Act / Assert + with pytest.raises(IOError, match=r"Could not download from URL, status=403"): + download(url=MagicMock()) def test_get_url_headers(): From 9cf3a750114e58f85c30ee5d3ffe0367581abf10 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 16:59:11 +0100 Subject: [PATCH 12/50] Unit tests for download() Signed-off-by: Bram Stoeller --- tests/unit/utils/test_download.py | 88 +++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index fd73ce3e..bf5de783 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -215,6 +215,94 @@ def urlretrieve(*_args, **_kwargs): assert_log_exists(capture, "warning", "Downloaded an empty file") +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__skip_existing_file( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + download_path = temp_dir / "data.zip" + make_file(download_path, ZIP_SIZE) + + mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_download_path.return_value = download_path + + # Act + with structlog.testing.capture_logs() as capture: + download(url=MagicMock()) + assert_log_exists(capture, "debug", "Skip downloading existing file") + + # Assert + mock_urlretrieve.assert_not_called() + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__update_file( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + make_file(download_path, ZIP_SIZE) + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, ZIP_SIZE + 1) + return temp_file, None + + mock_url_headers.return_value = (200, ZIP_SIZE + 1, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act / Assert + with structlog.testing.capture_logs() as capture: + result = download(url=MagicMock()) + assert_log_exists(capture, "debug", "Re-downloading existing file, because the size has changed") + + # Assert + assert result == download_path + assert result.is_file() + assert result.stat().st_size == ZIP_SIZE + 1 + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__overwrite( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + make_file(download_path, ZIP_SIZE) + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, ZIP_SIZE) + return temp_file, None + + mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act / Assert + with structlog.testing.capture_logs() as capture: + result = download(url=MagicMock(), overwrite=True) + assert_log_exists(capture, "debug", "Forced re-downloading existing file") + + # Assert + assert result == download_path + assert result.is_file() + assert result.stat().st_size == ZIP_SIZE + + @patch("power_grid_model_io.utils.download.get_url_headers") def test_download__status_error(mock_url_headers: MagicMock): # Arrange From eea08865c0acd8ffb68cc75668bc774126f31cd1 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Tue, 28 Feb 2023 16:19:11 +0100 Subject: [PATCH 13/50] get_response_info unit tests Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 36 ++++++--- tests/unit/utils/test_download.py | 97 +++++++++++++++++------ 2 files changed, 95 insertions(+), 38 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 42ab30b3..1f683fae 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -8,9 +8,10 @@ import hashlib import re import zipfile +from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir -from typing import ByteString, Optional, Tuple +from typing import ByteString, Optional from urllib import request import structlog @@ -19,6 +20,17 @@ _log = structlog.get_logger(__name__) +@dataclass +class ResponseInfo: + """ + Struct to store response information extracted from the response header + """ + + status: int + file_name: Optional[str] = None + file_size: Optional[int] = None + + class DownloadProgressHook: # pylint: disable=too-few-public-methods """ Report hook for request.urlretrieve() to update a progress bar based on the amount of downloaded blocks @@ -96,12 +108,12 @@ def download( Returns: The path to the downloaded file """ - status_code, remote_size, remote_file_name = get_url_headers(url=url) - if status_code != 200: - raise IOError(f"Could not download from URL, status={status_code}") + info = get_response_info(url=url) + if info.status != 200: + raise IOError(f"Could not download from URL, status={info.status}") - if file_path is None and remote_file_name: - file_path = Path(remote_file_name) + if file_path is None and info.file_name: + file_path = Path(info.file_name) file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) log = _log.bind(url=url, file_path=file_path) @@ -112,13 +124,13 @@ def download( # Don't remove the existing file just yet... Let's first see if we can download a new version. else: local_size = file_path.stat().st_size - if local_size == remote_size: + if local_size == info.file_size: log.debug("Skip downloading existing file") return file_path log.debug( "Re-downloading existing file, because the size has changed", local_size=local_size, - remote_size=remote_size, + remote_size=info.file_size, ) else: log.debug("Downloading file") @@ -144,7 +156,7 @@ def download( return file_path -def get_url_headers(url: str) -> Tuple[int, int, Optional[str]]: +def get_response_info(url: str) -> ResponseInfo: """ Retrieve the file size of a given URL (based on it's header) @@ -155,13 +167,13 @@ def get_url_headers(url: str) -> Tuple[int, int, Optional[str]]: The file size in bytes """ with request.urlopen(url) as context: - status_code = context.status + status = context.status headers = context.headers - file_size = int(headers.get("Content-Length", 0)) + file_size = int(headers["Content-Length"]) if "Content-Length" in headers else None matches = re.findall(r"filename=\"(.+)\"", headers.get("Content-Disposition", "")) file_name = matches[0] if matches else None - return status_code, file_size, file_name + return ResponseInfo(status=status, file_size=file_size, file_name=file_name) def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index bf5de783..1bdc2072 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: MPL-2.0 import tempfile +from collections import namedtuple from pathlib import Path from unittest.mock import ANY, MagicMock, patch @@ -10,16 +11,19 @@ from power_grid_model_io.utils.download import ( DownloadProgressHook, + ResponseInfo, download, download_and_extract, extract, get_download_path, - get_url_headers, + get_response_info, ) from ...utils import assert_log_exists -ZIP_SIZE = 1024 +ZIP_SIZE = 1024 # 1kb + +Response = namedtuple("Response", ["status", "headers"]) @pytest.fixture() @@ -122,9 +126,9 @@ def test_download_and_extract__overwrite(mock_download: MagicMock, temp_dir: Pat @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm") @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download( - mock_url_headers: MagicMock, + mock_info: MagicMock, mock_download_path: MagicMock, mock_tqdm: MagicMock, mock_urlretrieve: MagicMock, @@ -142,7 +146,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -165,9 +169,9 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__auto_file_name( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -177,7 +181,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_url_headers.return_value = (200, 0, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -192,9 +196,9 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__empty_file( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -204,7 +208,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_url_headers.return_value = (200, 0, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -219,15 +223,15 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__skip_existing_file( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange download_path = temp_dir / "data.zip" make_file(download_path, ZIP_SIZE) - mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path # Act @@ -243,9 +247,9 @@ def test_download__skip_existing_file( @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__update_file( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -256,7 +260,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE + 1) return temp_file, None - mock_url_headers.return_value = (200, ZIP_SIZE + 1, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -275,9 +279,9 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__overwrite( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -288,7 +292,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -303,18 +307,59 @@ def urlretrieve(*_args, **_kwargs): assert result.stat().st_size == ZIP_SIZE -@patch("power_grid_model_io.utils.download.get_url_headers") -def test_download__status_error(mock_url_headers: MagicMock): +@patch("power_grid_model_io.utils.download.get_response_info") +def test_download__status_error(mock_info: MagicMock): # Arrange - mock_url_headers.return_value = (403, 0, None) + mock_info.return_value = ResponseInfo(status=404, file_size=None, file_name=None) # Act / Assert - with pytest.raises(IOError, match=r"Could not download from URL, status=403"): + with pytest.raises(IOError, match=r"Could not download from URL, status=404"): download(url=MagicMock()) -def test_get_url_headers(): - get_url_headers +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info(mock_urlopen): + + # Arrange + headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} + mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=456, file_name="filename.zip") + + +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info__no_file_name(mock_urlopen): + + # Arrange + headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"'} + mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=456, file_name=None) + + +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info__no_disposition(mock_urlopen): + + # Arrange + headers = {"Content-Length": "456"} + Context = namedtuple("Context", ["status", "headers"]) + mock_urlopen.return_value.__enter__.return_value = Context(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=456, file_name=None) + + +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info__no_length(mock_urlopen): + + # Arrange + headers = {"Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} + mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=None, file_name="filename.zip") def test_get_download_path(): From 0adc6dd8ca1a5c5fd0b990e7aee5a754f307d3dd Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Tue, 28 Feb 2023 17:03:25 +0100 Subject: [PATCH 14/50] Unit tests for get_download_path() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 15 ++++++-- tests/unit/utils/test_download.py | 45 ++++++++++++++++++++--- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 1f683fae..49107b27 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir -from typing import ByteString, Optional +from typing import Optional from urllib import request import structlog @@ -176,7 +176,9 @@ def get_response_info(url: str) -> ResponseInfo: return ResponseInfo(status=status, file_size=file_size, file_name=file_name) -def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: +def get_download_path( + dir_path: Optional[Path] = None, file_path: Optional[Path] = None, data: Optional[bytes] = None +) -> Path: """ Determine the file path based on dir_path, file_path and/or data @@ -190,19 +192,24 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= # If no file_path is given, generate a file name if file_path is None: + if data is None: + raise ValueError("Supply data in order to auto generate a download path.") try: sha256 = hashlib.sha256() sha256.update(data) hash_str = sha256.hexdigest() except (TypeError, ValueError) as ex: - raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex - file_path = Path(__name__) / f"{hash_str}.download" + raise ValueError( + f"Can't auto generate a download path for '{type(data).__name__}' data (convert it to bytes first)." + ) from ex + file_path = Path(f"{hash_str}.download") # If no dir_path is given, use current working dir elif dir_path is None: dir_path = Path(".") # Combine the two paths + assert file_path is not None file_path = (dir_path / file_path).resolve() if dir_path else file_path.resolve() # If the file_path exists, it should be a file (not a dir) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 1bdc2072..84028290 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -319,7 +319,6 @@ def test_download__status_error(mock_info: MagicMock): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info(mock_urlopen): - # Arrange headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) @@ -330,7 +329,6 @@ def test_get_response_info(mock_urlopen): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info__no_file_name(mock_urlopen): - # Arrange headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"'} mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) @@ -341,7 +339,6 @@ def test_get_response_info__no_file_name(mock_urlopen): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info__no_disposition(mock_urlopen): - # Arrange headers = {"Content-Length": "456"} Context = namedtuple("Context", ["status", "headers"]) @@ -353,7 +350,6 @@ def test_get_response_info__no_disposition(mock_urlopen): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info__no_length(mock_urlopen): - # Arrange headers = {"Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) @@ -362,8 +358,45 @@ def test_get_response_info__no_length(mock_urlopen): assert get_response_info("") == ResponseInfo(status=123, file_size=None, file_name="filename.zip") -def test_get_download_path(): - get_download_path +def test_get_download_path(temp_dir): + # Act + path = get_download_path(dir_path=temp_dir, file_path=Path("file_name.zip"), data=b"foo") + + # Assert + assert path == temp_dir / "file_name.zip" + + +def test_get_download_path__auto_dir(): + # Act + path = get_download_path(file_path=Path("file_name.zip")) + + # Assert + assert path == Path(".").absolute() / "file_name.zip" + + +def test_get_download_path__auto_file_name(temp_dir): + # Arrange + foo_sha256_hash = "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae" + + # Act + path = get_download_path(dir_path=temp_dir, data=b"foo") + + # Assert + assert path == temp_dir / f"{foo_sha256_hash}.download" + + +def test_get_download_path__invalid_data_type(temp_dir): + # Act / Assert + with pytest.raises( + ValueError, match=r"Can't auto generate a download path for 'str' data \(convert it to bytes first\)\." + ): + get_download_path(data="foo") # type: ignore + + +def test_get_download_path__missing_data(temp_dir): + # Act / Assert + with pytest.raises(ValueError, match=r"Supply data in order to auto generate a download path\."): + get_download_path(dir_path=temp_dir) def test_extract(): From c654ca5df8af18e4c6270c592579b1543f2cf251 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 10:20:16 +0100 Subject: [PATCH 15/50] Refactoring Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 83 ++++++++++++----------- tests/unit/utils/test_download.py | 56 ++++++++------- 2 files changed, 75 insertions(+), 64 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 49107b27..76ef25f4 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -5,13 +5,15 @@ Helper functions to download (and store) files from the internet """ +import base64 import hashlib import re +import tempfile import zipfile from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir -from typing import Optional +from typing import Optional, Union from urllib import request import structlog @@ -61,7 +63,7 @@ def __call__(self, block_num: int, block_size: int, file_size: int) -> None: def download_and_extract( - url: str, dir_path: Optional[Path] = None, file_path: Optional[Path] = None, overwrite: bool = False + url: str, dir_path: Optional[Path] = None, file_name: Optional[Union[str, Path]] = None, overwrite: bool = False ) -> Path: """ Download a file from a URL and store it locally, extract the contents and return the path to the contents. @@ -70,7 +72,7 @@ def download_and_extract( url: The url to the .zip file dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir will be used. - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + file_name: An optional file name (or path relative to dir_path). If no file_name is given, a file name is generated based on the url overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? Be careful with this option, as it will remove files from your drive irreversibly! @@ -80,7 +82,7 @@ def download_and_extract( """ # Download the file and use the file name as the base name for the extraction directory - src_file_path = download(url=url, file_path=file_path, dir_path=dir_path, overwrite=overwrite) + src_file_path = download(url=url, file_name=file_name, dir_path=dir_path, overwrite=overwrite) dst_dir_path = src_file_path.with_suffix("") # If we explicitly want to overwrite the extracted files, remove the @@ -92,14 +94,14 @@ def download_and_extract( def download( - url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False + url: str, file_name: Optional[Union[str, Path]] = None, dir_path: Optional[Path] = None, overwrite: bool = False ) -> Path: """ Download a file from a URL and store it locally Args: url: The url to the file - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + file_name: An optional file name (or path relative to dir_path). If no file_name is given, a file name is generated based on the url dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir will be used. @@ -108,14 +110,16 @@ def download( Returns: The path to the downloaded file """ + + # get the response info, if the status is not 200 info = get_response_info(url=url) if info.status != 200: raise IOError(f"Could not download from URL, status={info.status}") - if file_path is None and info.file_name: - file_path = Path(info.file_name) + if file_name is None and info.file_name: + file_name = info.file_name - file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) + file_path = get_download_path(dir_path=dir_path, file_name=file_name, unique_key=url) log = _log.bind(url=url, file_path=file_path) if file_path.is_file(): @@ -177,40 +181,39 @@ def get_response_info(url: str) -> ResponseInfo: def get_download_path( - dir_path: Optional[Path] = None, file_path: Optional[Path] = None, data: Optional[bytes] = None + dir_path: Optional[Path] = None, + file_name: Optional[Union[str, Path]] = None, + unique_key: Optional[str] = None, ) -> Path: """ - Determine the file path based on dir_path, file_path and/or data + Determine the file path based on dir_path, file_name and/or data Args: - dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will - be used. Use Path(".") if you intend to use the current working directory. - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is - generated based on the url - data: A bytestring that can be used to generate a filename. + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir + will be used. If omitted, the tempfolder is used. + file_name: An optional file name (or path relative to dir_path). If no file_name is given, a file name is + generated based on the unique key (e.g. an url) + unique_key: A unique string that can be used to generate a filename (e.g. a url). """ - # If no file_path is given, generate a file name - if file_path is None: - if data is None: + # If no file_name is given, generate a file name + if file_name is None: + if unique_key is None: raise ValueError("Supply data in order to auto generate a download path.") - try: - sha256 = hashlib.sha256() - sha256.update(data) - hash_str = sha256.hexdigest() - except (TypeError, ValueError) as ex: - raise ValueError( - f"Can't auto generate a download path for '{type(data).__name__}' data (convert it to bytes first)." - ) from ex - file_path = Path(f"{hash_str}.download") - - # If no dir_path is given, use current working dir + + sha256 = hashlib.sha256() + sha256.update(unique_key.encode()) + hash_str = base64.b64encode(sha256.digest()).decode("ascii") + hash_str = hash_str.replace("/", "_").replace("+", "-").rstrip("=") + file_name = Path(f"{hash_str}.download") + + # If no dir_path is given, use the system's designated folder for temporary files elif dir_path is None: - dir_path = Path(".") + dir_path = Path(tempfile.gettempdir()) # Combine the two paths - assert file_path is not None - file_path = (dir_path / file_path).resolve() if dir_path else file_path.resolve() + assert file_name is not None + file_path = (dir_path / file_name).resolve() if dir_path else Path(file_name).resolve() # If the file_path exists, it should be a file (not a dir) if file_path.exists() and not file_path.is_file(): @@ -225,14 +228,15 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex Args: src_file_path: The .zip file to extract. - src_file_path: An optional destination path. If none is given, the src_file_path wihout .zip extension is used. - skip_if_exists: If true, it returns the dir path, otherwise raise an exception. + dst_dir_path: An optional destination path. If none is given, the src_file_path without .zip extension is used. + skip_if_exists: Skip existing files, otherwise raise an exception when a file exists. Returns: The path where the files are extracted """ if src_file_path.suffix.lower() != ".zip": raise ValueError(f"Only files with .zip extension are supported, got {src_file_path}") + if dst_dir_path is None: dst_dir_path = src_file_path.with_suffix("") @@ -241,9 +245,6 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if dst_dir_path.exists(): if not dst_dir_path.is_dir(): raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") - if not skip_if_exists: - raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") - log.debug("Skip extraction, destination dir exists") else: # Create the destination directory @@ -253,6 +254,12 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex with zipfile.ZipFile(src_file_path, "r") as zip_file: file_list = zip_file.namelist() for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + dst_file_path = dst_dir_path / file_path + if dst_file_path.exists() and dst_file_path.stat().st_size > 0: + if skip_if_exists: + log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) + continue + raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") zip_file.extract(member=file_path, path=dst_dir_path) # If the zip files contains a single directory with the same name as the zip file, return that dir diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 84028290..112cb8be 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -77,10 +77,10 @@ def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: Mag mock_extract.return_value = extract_dir_path # Act - result = download_and_extract(url=url, dir_path=dir_path, file_path=file_path) + result = download_and_extract(url=url, dir_path=dir_path, file_name=file_path) # Assert - mock_download.assert_called_once_with(url=url, file_path=file_path, dir_path=dir_path, overwrite=False) + mock_download.assert_called_once_with(url=url, file_name=file_path, dir_path=dir_path, overwrite=False) mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) assert result == extract_dir_path @@ -99,7 +99,7 @@ def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: download_and_extract(url=url) # Assert - mock_download.assert_called_once_with(url=url, file_path=None, dir_path=None, overwrite=False) + mock_download.assert_called_once_with(url=url, file_name=None, dir_path=None, overwrite=False) mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) @@ -146,17 +146,19 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote1.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve # Act / assert with structlog.testing.capture_logs() as capture: - result = download(url=url, file_path=file_path, dir_path=dir_path) + result = download(url=url, file_name=file_path, dir_path=dir_path) assert_log_exists(capture, "debug", "Downloading file") # Assert - mock_download_path.assert_called_once_with(dir_path=dir_path, file_path=file_path, data=b"https://www.source.com") + mock_download_path.assert_called_once_with( + dir_path=dir_path, file_name=file_path, unique_key="https://www.source.com" + ) mock_tqdm.assert_called_once() mock_hook.assert_called_once_with(mock_tqdm.return_value.__enter__.return_value) mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) @@ -181,7 +183,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote2.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -189,7 +191,7 @@ def urlretrieve(*_args, **_kwargs): download(url=MagicMock()) # Assert - mock_download_path.assert_called_once_with(dir_path=None, file_path=Path("remote.zip"), data=ANY) + mock_download_path.assert_called_once_with(dir_path=None, file_name="remote2.zip", unique_key=ANY) @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -208,7 +210,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote3.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -231,7 +233,7 @@ def test_download__skip_existing_file( download_path = temp_dir / "data.zip" make_file(download_path, ZIP_SIZE) - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote4.zip") mock_download_path.return_value = download_path # Act @@ -260,7 +262,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE + 1) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote5.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -292,7 +294,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote6.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -360,7 +362,7 @@ def test_get_response_info__no_length(mock_urlopen): def test_get_download_path(temp_dir): # Act - path = get_download_path(dir_path=temp_dir, file_path=Path("file_name.zip"), data=b"foo") + path = get_download_path(dir_path=temp_dir, file_name="file_name.zip", unique_key="foo") # Assert assert path == temp_dir / "file_name.zip" @@ -368,29 +370,23 @@ def test_get_download_path(temp_dir): def test_get_download_path__auto_dir(): # Act - path = get_download_path(file_path=Path("file_name.zip")) + path = get_download_path(file_name="file_name.zip") # Assert - assert path == Path(".").absolute() / "file_name.zip" + assert path == Path(tempfile.gettempdir()).absolute() / "file_name.zip" def test_get_download_path__auto_file_name(temp_dir): # Arrange - foo_sha256_hash = "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae" + # The base64 representation of the sha256 hash of "foo" is LCa0a2j/xo/5m0U8HTBBNBNCLXBkg7+g+YpeiGJm564= + # The / and + will be replaced with a _ and - character and the trailing = character(s) will be removed. + expected_file_name = "LCa0a2j_xo_5m0U8HTBBNBNCLXBkg7-g-YpeiGJm564.download" # Act - path = get_download_path(dir_path=temp_dir, data=b"foo") + path = get_download_path(dir_path=temp_dir, unique_key="foo") # Assert - assert path == temp_dir / f"{foo_sha256_hash}.download" - - -def test_get_download_path__invalid_data_type(temp_dir): - # Act / Assert - with pytest.raises( - ValueError, match=r"Can't auto generate a download path for 'str' data \(convert it to bytes first\)\." - ): - get_download_path(data="foo") # type: ignore + assert path == temp_dir / expected_file_name def test_get_download_path__missing_data(temp_dir): @@ -399,5 +395,13 @@ def test_get_download_path__missing_data(temp_dir): get_download_path(dir_path=temp_dir) +def test_get_download_path__invalid_file_path(temp_dir): + # Arrange + make_dir(temp_dir / "download") + # Act / Assert + with pytest.raises(ValueError, match=r"Invalid file path:"): + get_download_path(dir_path=temp_dir, file_name="download") + + def test_extract(): extract From 574ff5328fa2f3ab6b8a9897fdca82d03cdfdadd Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 11:19:11 +0100 Subject: [PATCH 16/50] Add two very small zip files for unit testing Signed-off-by: Bram Stoeller --- tests/data/downloader/one-file.zip | Bin 0 -> 126 bytes tests/data/downloader/one-file.zip.license | 3 + tests/data/downloader/two-files.zip | Bin 0 -> 247 bytes tests/data/downloader/two-files.zip.license | 3 + tests/unit/utils/test_download.py | 61 ++++++++++---------- 5 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 tests/data/downloader/one-file.zip create mode 100644 tests/data/downloader/one-file.zip.license create mode 100644 tests/data/downloader/two-files.zip create mode 100644 tests/data/downloader/two-files.zip.license diff --git a/tests/data/downloader/one-file.zip b/tests/data/downloader/one-file.zip new file mode 100644 index 0000000000000000000000000000000000000000..1c1130a3a8542f079d908eb6f6ece0c30122fe52 GIT binary patch literal 126 zcmWIWW@Zs#00Eca#ISwL8ToubHVCr=aaw-9UP(oXPkvEqu0m!(acQoeo?d`ABa;XN gZcRXa42%p4APNYeMg(}XvVpjaKxhi2jX@j+0Fbs5I{*Lx literal 0 HcmV?d00001 diff --git a/tests/data/downloader/one-file.zip.license b/tests/data/downloader/one-file.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/downloader/one-file.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/downloader/two-files.zip b/tests/data/downloader/two-files.zip new file mode 100644 index 0000000000000000000000000000000000000000..5f7c0598608bcbeb5f92ef6fa083c84039038b06 GIT binary patch literal 247 zcmWIWW@Zs#00FPy#IOdj7;`}&8-)3QI4wUXCACODDX~beq@qMmPcJ1uC%;IcII~0{ zF*mg&0It~usCgfAMm|Ur2(yDV=Yw?lU~k@aG;5Tpa5*A}c7Y<++?D;r3H2?z^;bODIN0079*E2jVe literal 0 HcmV?d00001 diff --git a/tests/data/downloader/two-files.zip.license b/tests/data/downloader/two-files.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/downloader/two-files.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 112cb8be..99409e5d 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 +import shutil import tempfile from collections import namedtuple from pathlib import Path @@ -21,10 +22,14 @@ from ...utils import assert_log_exists -ZIP_SIZE = 1024 # 1kb - Response = namedtuple("Response", ["status", "headers"]) +DATA_DIR = Path(__file__).parents[2] / "data" / "downloader" +ZIP1 = DATA_DIR / "one-file.zip" +ZIP2 = DATA_DIR / "two-files.zip" +ZIP1_SIZE = ZIP1.stat().st_size +ZIP2_SIZE = ZIP2.stat().st_size + @pytest.fixture() def temp_dir(): @@ -32,11 +37,6 @@ def temp_dir(): yield Path(tmp) -def make_file(file_name: Path, file_size: int = 0): - with open(file_name, "wb") as fp: - fp.write(b"\0" * file_size) - - def make_dir(dirname: Path): dirname.mkdir(parents=True) @@ -143,10 +143,10 @@ def test_download( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - make_file(temp_file, ZIP_SIZE) + shutil.copyfile(ZIP1, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote1.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -164,7 +164,7 @@ def urlretrieve(*_args, **_kwargs): mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) assert result == file_path assert result.is_file() - assert result.stat().st_size == ZIP_SIZE + assert result.stat().st_size == ZIP1_SIZE @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -180,10 +180,10 @@ def test_download__auto_file_name( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - make_file(temp_file, 0) + shutil.copyfile(ZIP1, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote2.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -191,7 +191,7 @@ def urlretrieve(*_args, **_kwargs): download(url=MagicMock()) # Assert - mock_download_path.assert_called_once_with(dir_path=None, file_name="remote2.zip", unique_key=ANY) + mock_download_path.assert_called_once_with(dir_path=None, file_name="remote.zip", unique_key=ANY) @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -207,10 +207,11 @@ def test_download__empty_file( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - make_file(temp_file, 0) + with open(temp_file, "wb"): + pass return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote3.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -231,9 +232,9 @@ def test_download__skip_existing_file( ): # Arrange download_path = temp_dir / "data.zip" - make_file(download_path, ZIP_SIZE) + shutil.copyfile(ZIP1, download_path) - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote4.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path # Act @@ -256,13 +257,13 @@ def test_download__update_file( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - make_file(download_path, ZIP_SIZE) + shutil.copyfile(ZIP1, download_path) def urlretrieve(*_args, **_kwargs): - make_file(temp_file, ZIP_SIZE + 1) + shutil.copyfile(ZIP2, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote5.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP2_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -274,7 +275,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP_SIZE + 1 + assert result.stat().st_size == ZIP2_SIZE @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -288,13 +289,13 @@ def test_download__overwrite( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - make_file(download_path, ZIP_SIZE) + shutil.copyfile(ZIP1, download_path) def urlretrieve(*_args, **_kwargs): - make_file(temp_file, ZIP_SIZE) + shutil.copyfile(ZIP1, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote6.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -306,7 +307,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP_SIZE + assert result.stat().st_size == ZIP1_SIZE @patch("power_grid_model_io.utils.download.get_response_info") @@ -360,7 +361,7 @@ def test_get_response_info__no_length(mock_urlopen): assert get_response_info("") == ResponseInfo(status=123, file_size=None, file_name="filename.zip") -def test_get_download_path(temp_dir): +def test_get_download_path(temp_dir: Path): # Act path = get_download_path(dir_path=temp_dir, file_name="file_name.zip", unique_key="foo") @@ -376,7 +377,7 @@ def test_get_download_path__auto_dir(): assert path == Path(tempfile.gettempdir()).absolute() / "file_name.zip" -def test_get_download_path__auto_file_name(temp_dir): +def test_get_download_path__auto_file_name(temp_dir: Path): # Arrange # The base64 representation of the sha256 hash of "foo" is LCa0a2j/xo/5m0U8HTBBNBNCLXBkg7+g+YpeiGJm564= # The / and + will be replaced with a _ and - character and the trailing = character(s) will be removed. @@ -389,13 +390,13 @@ def test_get_download_path__auto_file_name(temp_dir): assert path == temp_dir / expected_file_name -def test_get_download_path__missing_data(temp_dir): +def test_get_download_path__missing_data(temp_dir: Path): # Act / Assert with pytest.raises(ValueError, match=r"Supply data in order to auto generate a download path\."): get_download_path(dir_path=temp_dir) -def test_get_download_path__invalid_file_path(temp_dir): +def test_get_download_path__invalid_file_path(temp_dir: Path): # Arrange make_dir(temp_dir / "download") # Act / Assert @@ -403,5 +404,5 @@ def test_get_download_path__invalid_file_path(temp_dir): get_download_path(dir_path=temp_dir, file_name="download") -def test_extract(): +def test_extract(temp_dir: Path): extract From a7355502ca2f04e7f24748aeadecaa598cc46e8b Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 12:06:44 +0100 Subject: [PATCH 17/50] Rename example zip files Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 42 +++++++---- .../downloader/{two-files.zip => foo-bar.zip} | Bin ...e-file.zip.license => foo-bar.zip.license} | 0 tests/data/downloader/foo.zip | Bin 0 -> 134 bytes ...{two-files.zip.license => foo.zip.license} | 0 tests/data/downloader/one-file.zip | Bin 126 -> 0 bytes tests/unit/utils/test_download.py | 70 +++++++++++++++--- tests/utils.py | 13 ++++ 8 files changed, 97 insertions(+), 28 deletions(-) rename tests/data/downloader/{two-files.zip => foo-bar.zip} (100%) rename tests/data/downloader/{one-file.zip.license => foo-bar.zip.license} (100%) create mode 100644 tests/data/downloader/foo.zip rename tests/data/downloader/{two-files.zip.license => foo.zip.license} (100%) delete mode 100644 tests/data/downloader/one-file.zip diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 76ef25f4..40c6db58 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -3,6 +3,17 @@ # SPDX-License-Identifier: MPL-2.0 """ Helper functions to download (and store) files from the internet + +The most simple (and intended) usage is: +url = "http://141.51.193.167/simbench/gui/usecase/download/?simbench_code=1-complete_data-mixed-all-0-sw&format=csv" +csv_dir = download_and_extract(url) + +It will download the zip file 1-complete_data-mixed-all-0-sw.zip to a folder in you systems temp dir; for example +"/tmp/1-complete_data-mixed-all-0-sw.zip". Then it extracts the files there as well, in a folder that corresponds to +the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in our example), and it returns the path to that directory. +By default, it will not re-download or re-extract the zip file as long as the files exist in you temp dir. Your temp +dir is typically emptied whe you reboot your computer. + """ import base64 @@ -222,7 +233,7 @@ def get_download_path( return file_path -def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=True) -> Path: +def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: """ Extract a .zip file and return the destination dir @@ -246,21 +257,20 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if not dst_dir_path.is_dir(): raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") - else: - # Create the destination directory - dst_dir_path.mkdir(parents=True, exist_ok=True) - - # Extract per file, so we can show a progress bar - with zipfile.ZipFile(src_file_path, "r") as zip_file: - file_list = zip_file.namelist() - for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): - dst_file_path = dst_dir_path / file_path - if dst_file_path.exists() and dst_file_path.stat().st_size > 0: - if skip_if_exists: - log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) - continue - raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") - zip_file.extract(member=file_path, path=dst_dir_path) + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + dst_file_path = dst_dir_path / file_path + if dst_file_path.exists() and dst_file_path.stat().st_size > 0: + if skip_if_exists: + log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) + continue + raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") + zip_file.extract(member=file_path, path=dst_dir_path) # If the zip files contains a single directory with the same name as the zip file, return that dir contents = list(dst_dir_path.iterdir()) diff --git a/tests/data/downloader/two-files.zip b/tests/data/downloader/foo-bar.zip similarity index 100% rename from tests/data/downloader/two-files.zip rename to tests/data/downloader/foo-bar.zip diff --git a/tests/data/downloader/one-file.zip.license b/tests/data/downloader/foo-bar.zip.license similarity index 100% rename from tests/data/downloader/one-file.zip.license rename to tests/data/downloader/foo-bar.zip.license diff --git a/tests/data/downloader/foo.zip b/tests/data/downloader/foo.zip new file mode 100644 index 0000000000000000000000000000000000000000..8e13ab608434d4387c6fe15626003c35f659f68f GIT binary patch literal 134 zcmWIWW@Zs#00Eca#ISwL8ToubHVAVAaaw-9J`n4bRFwGS7p3MZWEK>c=IZI`1$Z+u ji7?>S2-M5K$e;kCfB?yu0B=?{kPssfS^{Zv5QhN(qP!L{ literal 0 HcmV?d00001 diff --git a/tests/data/downloader/two-files.zip.license b/tests/data/downloader/foo.zip.license similarity index 100% rename from tests/data/downloader/two-files.zip.license rename to tests/data/downloader/foo.zip.license diff --git a/tests/data/downloader/one-file.zip b/tests/data/downloader/one-file.zip deleted file mode 100644 index 1c1130a3a8542f079d908eb6f6ece0c30122fe52..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 126 zcmWIWW@Zs#00Eca#ISwL8ToubHVCr=aaw-9UP(oXPkvEqu0m!(acQoeo?d`ABa;XN gZcRXa42%p4APNYeMg(}XvVpjaKxhi2jX@j+0Fbs5I{*Lx diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 99409e5d..2f2d993d 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -20,13 +20,13 @@ get_response_info, ) -from ...utils import assert_log_exists +from ...utils import MockTqdm, assert_log_exists Response = namedtuple("Response", ["status", "headers"]) DATA_DIR = Path(__file__).parents[2] / "data" / "downloader" -ZIP1 = DATA_DIR / "one-file.zip" -ZIP2 = DATA_DIR / "two-files.zip" +ZIP1 = DATA_DIR / "foo.zip" +ZIP2 = DATA_DIR / "foo-bar.zip" ZIP1_SIZE = ZIP1.stat().st_size ZIP2_SIZE = ZIP2.stat().st_size @@ -37,10 +37,6 @@ def temp_dir(): yield Path(tmp) -def make_dir(dirname: Path): - dirname.mkdir(parents=True) - - def test_progress_hook(): # Arrange progress_bar = MagicMock() @@ -111,7 +107,7 @@ def test_download_and_extract__overwrite(mock_download: MagicMock, temp_dir: Pat mock_download.return_value = src_file_path dst_dir_path = temp_dir / "data" - make_dir(dst_dir_path) + dst_dir_path.mkdir() # Act / Assert download_and_extract(url=MagicMock(), overwrite=False) @@ -150,7 +146,7 @@ def urlretrieve(*_args, **_kwargs): mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve - # Act / assert + # Act / Assert with structlog.testing.capture_logs() as capture: result = download(url=url, file_name=file_path, dir_path=dir_path) assert_log_exists(capture, "debug", "Downloading file") @@ -398,11 +394,61 @@ def test_get_download_path__missing_data(temp_dir: Path): def test_get_download_path__invalid_file_path(temp_dir: Path): # Arrange - make_dir(temp_dir / "download") + (temp_dir / "download").mkdir() + # Act / Assert with pytest.raises(ValueError, match=r"Invalid file path:"): get_download_path(dir_path=temp_dir, file_name="download") -def test_extract(temp_dir: Path): - extract +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + # Assert + assert (dst_dir_path / "foo.txt").is_file() + assert (dst_dir_path / "folder/bar.txt").is_file() + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with pytest.raises(FileExistsError, match=r"Destination file .*foo\.txt exists and is not empty"): + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with structlog.testing.capture_logs() as capture: + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert_log_exists( + capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" + ) diff --git a/tests/utils.py b/tests/utils.py index 045686c7..e6cd4547 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -268,3 +268,16 @@ def sheet_names(self) -> List[str]: def parse(self, sheet_name: str, **_kwargs) -> pd.DataFrame: return self.data[sheet_name] + + + + +class MockTqdm: + """To use: for x in tqdm(iterable)""" + + def __init__(self, iterable=None, **kwargs): + self.iterable = iterable + + def __iter__(self): + for item in self.iterable: + yield item \ No newline at end of file From 108810aceb337e572eb895111377998e9ca0b40a Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 13:52:03 +0100 Subject: [PATCH 18/50] Unit test for extract() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 24 ++++++++++----- tests/unit/utils/test_download.py | 36 +++++++++++++++++++++-- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 40c6db58..e8d40ead 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -224,13 +224,13 @@ def get_download_path( # Combine the two paths assert file_name is not None - file_path = (dir_path / file_name).resolve() if dir_path else Path(file_name).resolve() + file_path = (dir_path / file_name) if dir_path else Path(file_name) # If the file_path exists, it should be a file (not a dir) if file_path.exists() and not file_path.is_file(): raise ValueError(f"Invalid file path: {file_path}") - return file_path + return file_path.resolve() def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: @@ -272,9 +272,19 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") zip_file.extract(member=file_path, path=dst_dir_path) - # If the zip files contains a single directory with the same name as the zip file, return that dir - contents = list(dst_dir_path.iterdir()) - if len(contents) == 1 and contents[0].is_dir() and contents[0].name == src_file_path.stem: - dst_dir_path = contents[0] + # Zip files often contain a single directory with the same name as the zip file. + # In that case, return the dir to that directory instead of the root dir + only_item: Optional[Path] = None + for item in dst_dir_path.iterdir(): + # If only_item is None, this is the first iteration, so item may be the only item + if only_item is None: + only_item = item + # Else, if only_item is not None, there are more than one items in the root of the directory. + # This means hat there is no 'only_item' and we can stop the loop + else: + only_item = None + break + if only_item and only_item.is_dir() and only_item.name == src_file_path.stem: + dst_dir_path = only_item - return dst_dir_path + return dst_dir_path.resolve() diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 2f2d993d..7b70353f 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -410,13 +410,30 @@ def test_extract(mock_tqdm: MagicMock, temp_dir: Path): mock_tqdm.side_effect = MockTqdm # Act - extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + extract_dir_path = extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) # Assert + assert extract_dir_path == dst_dir_path assert (dst_dir_path / "foo.txt").is_file() assert (dst_dir_path / "folder/bar.txt").is_file() +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__auto_dir(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "compressed" + assert (temp_dir / "compressed" / "foo.txt").is_file() + assert (temp_dir / "compressed" / "folder" / "bar.txt").is_file() + + @patch("power_grid_model_io.utils.download.tqdm") def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): # Arrange @@ -438,7 +455,7 @@ def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): # Arrange src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "extracted" + dst_dir_path = temp_dir / "compressed" shutil.copyfile(ZIP2, src_file_path) mock_tqdm.side_effect = MockTqdm @@ -452,3 +469,18 @@ def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): assert_log_exists( capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" ) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__return_subdir_path(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "foo.zip" + shutil.copyfile(ZIP1, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "foo" / "foo" + assert (temp_dir / "foo" / "foo" / "foo.txt").is_file() From 08a80fa72e1f4c153deb3a7863a8767273eb57f7 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 14:49:47 +0100 Subject: [PATCH 19/50] Split download en zip_files util Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 76 +++---------- tests/data/downloader/foo-bar.zip | Bin 247 -> 0 bytes tests/data/downloader/foo-bar.zip.license | 3 - tests/data/downloader/foo.zip | Bin 134 -> 0 bytes tests/data/downloader/foo.zip.license | 3 - tests/unit/utils/test_download.py | 128 ++++------------------ 6 files changed, 33 insertions(+), 177 deletions(-) delete mode 100644 tests/data/downloader/foo-bar.zip delete mode 100644 tests/data/downloader/foo-bar.zip.license delete mode 100644 tests/data/downloader/foo.zip delete mode 100644 tests/data/downloader/foo.zip.license diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index e8d40ead..28912903 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -6,13 +6,19 @@ The most simple (and intended) usage is: url = "http://141.51.193.167/simbench/gui/usecase/download/?simbench_code=1-complete_data-mixed-all-0-sw&format=csv" -csv_dir = download_and_extract(url) +zip_file_path = download(url) It will download the zip file 1-complete_data-mixed-all-0-sw.zip to a folder in you systems temp dir; for example -"/tmp/1-complete_data-mixed-all-0-sw.zip". Then it extracts the files there as well, in a folder that corresponds to -the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in our example), and it returns the path to that directory. -By default, it will not re-download or re-extract the zip file as long as the files exist in you temp dir. Your temp -dir is typically emptied whe you reboot your computer. +"/tmp/1-complete_data-mixed-all-0-sw.zip". + +Another convenience function is download_and_extract(): + +csv_dir_path = download_and_extract(url) + +This downloads the zip file as described above, and then it extracts the files there as well, in a folder which +corresponds to the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in our example), and it returns the path to +that directory. By default, it will not re-download or re-extract the zip file as long as the files exist in your +temp dir. Your temp dir is typically emptied whe you reboot your computer. """ @@ -20,7 +26,6 @@ import hashlib import re import tempfile -import zipfile from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir @@ -30,6 +35,8 @@ import structlog from tqdm import tqdm +from power_grid_model_io.utils.zip_files import extract + _log = structlog.get_logger(__name__) @@ -231,60 +238,3 @@ def get_download_path( raise ValueError(f"Invalid file path: {file_path}") return file_path.resolve() - - -def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: - """ - Extract a .zip file and return the destination dir - - Args: - src_file_path: The .zip file to extract. - dst_dir_path: An optional destination path. If none is given, the src_file_path without .zip extension is used. - skip_if_exists: Skip existing files, otherwise raise an exception when a file exists. - - Returns: The path where the files are extracted - - """ - if src_file_path.suffix.lower() != ".zip": - raise ValueError(f"Only files with .zip extension are supported, got {src_file_path}") - - if dst_dir_path is None: - dst_dir_path = src_file_path.with_suffix("") - - log = _log.bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) - - if dst_dir_path.exists(): - if not dst_dir_path.is_dir(): - raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") - - # Create the destination directory - dst_dir_path.mkdir(parents=True, exist_ok=True) - - # Extract per file, so we can show a progress bar - with zipfile.ZipFile(src_file_path, "r") as zip_file: - file_list = zip_file.namelist() - for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): - dst_file_path = dst_dir_path / file_path - if dst_file_path.exists() and dst_file_path.stat().st_size > 0: - if skip_if_exists: - log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) - continue - raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") - zip_file.extract(member=file_path, path=dst_dir_path) - - # Zip files often contain a single directory with the same name as the zip file. - # In that case, return the dir to that directory instead of the root dir - only_item: Optional[Path] = None - for item in dst_dir_path.iterdir(): - # If only_item is None, this is the first iteration, so item may be the only item - if only_item is None: - only_item = item - # Else, if only_item is not None, there are more than one items in the root of the directory. - # This means hat there is no 'only_item' and we can stop the loop - else: - only_item = None - break - if only_item and only_item.is_dir() and only_item.name == src_file_path.stem: - dst_dir_path = only_item - - return dst_dir_path.resolve() diff --git a/tests/data/downloader/foo-bar.zip b/tests/data/downloader/foo-bar.zip deleted file mode 100644 index 5f7c0598608bcbeb5f92ef6fa083c84039038b06..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 247 zcmWIWW@Zs#00FPy#IOdj7;`}&8-)3QI4wUXCACODDX~beq@qMmPcJ1uC%;IcII~0{ zF*mg&0It~usCgfAMm|Ur2(yDV=Yw?lU~k@aG;5Tpa5*A}c7Y<++?D;r3H2?z^;bODIN0079*E2jVe diff --git a/tests/data/downloader/foo-bar.zip.license b/tests/data/downloader/foo-bar.zip.license deleted file mode 100644 index e6220e93..00000000 --- a/tests/data/downloader/foo-bar.zip.license +++ /dev/null @@ -1,3 +0,0 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project - -SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/downloader/foo.zip b/tests/data/downloader/foo.zip deleted file mode 100644 index 8e13ab608434d4387c6fe15626003c35f659f68f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 134 zcmWIWW@Zs#00Eca#ISwL8ToubHVAVAaaw-9J`n4bRFwGS7p3MZWEK>c=IZI`1$Z+u ji7?>S2-M5K$e;kCfB?yu0B=?{kPssfS^{Zv5QhN(qP!L{ diff --git a/tests/data/downloader/foo.zip.license b/tests/data/downloader/foo.zip.license deleted file mode 100644 index e6220e93..00000000 --- a/tests/data/downloader/foo.zip.license +++ /dev/null @@ -1,3 +0,0 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project - -SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 7b70353f..c4593aa7 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 -import shutil import tempfile from collections import namedtuple from pathlib import Path @@ -15,21 +14,14 @@ ResponseInfo, download, download_and_extract, - extract, get_download_path, get_response_info, ) -from ...utils import MockTqdm, assert_log_exists +from ...utils import assert_log_exists Response = namedtuple("Response", ["status", "headers"]) -DATA_DIR = Path(__file__).parents[2] / "data" / "downloader" -ZIP1 = DATA_DIR / "foo.zip" -ZIP2 = DATA_DIR / "foo-bar.zip" -ZIP1_SIZE = ZIP1.stat().st_size -ZIP2_SIZE = ZIP2.stat().st_size - @pytest.fixture() def temp_dir(): @@ -37,6 +29,11 @@ def temp_dir(): yield Path(tmp) +def make_file(file_path: Path, file_size: int = 0): + with open(file_path, "wb") as fp: + fp.write(b"\0" * file_size) + + def test_progress_hook(): # Arrange progress_bar = MagicMock() @@ -139,10 +136,10 @@ def test_download( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP1, temp_file) + make_file(temp_file, 100) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=100, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -160,7 +157,7 @@ def urlretrieve(*_args, **_kwargs): mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) assert result == file_path assert result.is_file() - assert result.stat().st_size == ZIP1_SIZE + assert result.stat().st_size == 100 @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -176,7 +173,7 @@ def test_download__auto_file_name( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP1, temp_file) + make_file(temp_file, 100) return temp_file, None mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") @@ -228,9 +225,9 @@ def test_download__skip_existing_file( ): # Arrange download_path = temp_dir / "data.zip" - shutil.copyfile(ZIP1, download_path) + make_file(download_path, 100) - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=100, file_name="remote.zip") mock_download_path.return_value = download_path # Act @@ -253,13 +250,13 @@ def test_download__update_file( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - shutil.copyfile(ZIP1, download_path) + make_file(download_path, 100) def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP2, temp_file) + make_file(temp_file, 101) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP2_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=101, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -271,7 +268,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP2_SIZE + assert result.stat().st_size == 101 @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -285,13 +282,13 @@ def test_download__overwrite( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - shutil.copyfile(ZIP1, download_path) + make_file(download_path, 100) def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP1, temp_file) + make_file(temp_file, 100) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=100, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -303,7 +300,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP1_SIZE + assert result.stat().st_size == 100 @patch("power_grid_model_io.utils.download.get_response_info") @@ -399,88 +396,3 @@ def test_get_download_path__invalid_file_path(temp_dir: Path): # Act / Assert with pytest.raises(ValueError, match=r"Invalid file path:"): get_download_path(dir_path=temp_dir, file_name="download") - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "extracted" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - # Act - extract_dir_path = extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) - - # Assert - assert extract_dir_path == dst_dir_path - assert (dst_dir_path / "foo.txt").is_file() - assert (dst_dir_path / "folder/bar.txt").is_file() - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__auto_dir(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - # Act - extract_dir_path = extract(src_file_path=src_file_path) - - # Assert - assert extract_dir_path == temp_dir / "compressed" - assert (temp_dir / "compressed" / "foo.txt").is_file() - assert (temp_dir / "compressed" / "folder" / "bar.txt").is_file() - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "extracted" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - dst_dir_path.mkdir() - with open(dst_dir_path / "foo.txt", "wb") as fp: - fp.write(b"\0") - - # Act / Assert - with pytest.raises(FileExistsError, match=r"Destination file .*foo\.txt exists and is not empty"): - extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "compressed" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - dst_dir_path.mkdir() - with open(dst_dir_path / "foo.txt", "wb") as fp: - fp.write(b"\0") - - # Act / Assert - with structlog.testing.capture_logs() as capture: - extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) - assert_log_exists( - capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" - ) - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__return_subdir_path(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "foo.zip" - shutil.copyfile(ZIP1, src_file_path) - mock_tqdm.side_effect = MockTqdm - - # Act - extract_dir_path = extract(src_file_path=src_file_path) - - # Assert - assert extract_dir_path == temp_dir / "foo" / "foo" - assert (temp_dir / "foo" / "foo" / "foo.txt").is_file() From f86c7e1b1db75cdafd16f05dea0cad85774adc39 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:07:16 +0100 Subject: [PATCH 20/50] Split download en zip_files util Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/zip_files.py | 79 ++++++++++++++ tests/unit/utils/test_zip_files.py | 115 +++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 src/power_grid_model_io/utils/zip_files.py create mode 100644 tests/unit/utils/test_zip_files.py diff --git a/src/power_grid_model_io/utils/zip_files.py b/src/power_grid_model_io/utils/zip_files.py new file mode 100644 index 00000000..6d5c45a5 --- /dev/null +++ b/src/power_grid_model_io/utils/zip_files.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +""" +Helper function to extract zip files + +csv_dir_path = extract("/tmp/1-complete_data-mixed-all-0-sw.zip") + +This extracts the files, in a folder which corresponds to the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in +our example), and it returns the path to that directory. By default, it will not re-download or re-extract the zip +file as long as the files exist. + +""" + +import zipfile +from pathlib import Path +from typing import Optional + +import structlog +from tqdm import tqdm + +_log = structlog.get_logger(__name__) + + +def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: + """ + Extract a .zip file and return the destination dir + + Args: + src_file_path: The .zip file to extract. + dst_dir_path: An optional destination path. If none is given, the src_file_path without .zip extension is used. + skip_if_exists: Skip existing files, otherwise raise an exception when a file exists. + + Returns: The path where the files are extracted + + """ + if src_file_path.suffix.lower() != ".zip": + raise ValueError(f"Only files with .zip extension are supported, got {src_file_path.name}") + + if dst_dir_path is None: + dst_dir_path = src_file_path.with_suffix("") + + log = _log.bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + if dst_dir_path.exists(): + if not dst_dir_path.is_dir(): + raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") + + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + dst_file_path = dst_dir_path / file_path + if dst_file_path.exists() and dst_file_path.stat().st_size > 0: + if skip_if_exists: + log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) + continue + raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") + zip_file.extract(member=file_path, path=dst_dir_path) + + # Zip files often contain a single directory with the same name as the zip file. + # In that case, return the dir to that directory instead of the root dir + only_item: Optional[Path] = None + for item in dst_dir_path.iterdir(): + # If only_item is None, this is the first iteration, so item may be the only item + if only_item is None: + only_item = item + # Else, if only_item is not None, there are more than one items in the root of the directory. + # This means hat there is no 'only_item' and we can stop the loop + else: + only_item = None + break + if only_item and only_item.is_dir() and only_item.name == src_file_path.stem: + dst_dir_path = only_item + + return dst_dir_path.resolve() diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py new file mode 100644 index 00000000..e6b79c6b --- /dev/null +++ b/tests/unit/utils/test_zip_files.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +import shutil +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +import structlog.testing + +from power_grid_model_io.utils.zip_files import extract + +from ...utils import MockTqdm, assert_log_exists + +DATA_DIR = Path(__file__).parents[2] / "data" / "zip_files" +ZIP1 = DATA_DIR / "foo.zip" +ZIP2 = DATA_DIR / "foo-bar.zip" + + +@pytest.fixture() +def temp_dir(): + with tempfile.TemporaryDirectory() as tmp: + yield Path(tmp) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + # Assert + assert extract_dir_path == dst_dir_path + assert (dst_dir_path / "foo.txt").is_file() + assert (dst_dir_path / "folder/bar.txt").is_file() + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__auto_dir(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "compressed" + assert (temp_dir / "compressed" / "foo.txt").is_file() + assert (temp_dir / "compressed" / "folder" / "bar.txt").is_file() + + +def test_extract__invalid_file_extension(): + # Act / Assert + with pytest.raises(ValueError, match=r"Only files with \.zip extension are supported, got tempfile\.download"): + extract(src_file_path=Path("/tmp/dir/tempfile.download")) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with pytest.raises(FileExistsError, match=r"Destination file .*foo\.txt exists and is not empty"): + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "compressed" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with structlog.testing.capture_logs() as capture: + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert_log_exists( + capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" + ) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__return_subdir_path(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "foo.zip" + shutil.copyfile(ZIP1, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "foo" / "foo" + assert (temp_dir / "foo" / "foo" / "foo.txt").is_file() From d923ae4667a1f26c9b98c2a2c12d4e461bd9795f Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:14:02 +0100 Subject: [PATCH 21/50] Final unit test for extract() Signed-off-by: Bram Stoeller --- tests/unit/utils/test_zip_files.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py index e6b79c6b..0a651624 100644 --- a/tests/unit/utils/test_zip_files.py +++ b/tests/unit/utils/test_zip_files.py @@ -63,6 +63,16 @@ def test_extract__invalid_file_extension(): extract(src_file_path=Path("/tmp/dir/tempfile.download")) +def test_extract__invalid_dst_dir(temp_dir: Path): + # Arrange + with open(temp_dir / "notadir.txt", "wb"): + pass + + # Act / Assert + with pytest.raises(NotADirectoryError, match=r"Destination dir .*notadir\.txt exists and is not a directory"): + extract(src_file_path=Path("file.zip"), dst_dir_path=temp_dir / "notadir.txt") + + @patch("power_grid_model_io.utils.download.tqdm") def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): # Arrange From 3236985451c12062552da005623be0eeb0e9e4f8 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:23:39 +0100 Subject: [PATCH 22/50] Add test .zip files Signed-off-by: Bram Stoeller --- tests/data/zip_files/foo-bar.zip | Bin 0 -> 247 bytes tests/data/zip_files/foo-bar.zip.license | 3 +++ tests/data/zip_files/foo.zip | Bin 0 -> 134 bytes tests/data/zip_files/foo.zip.license | 3 +++ 4 files changed, 6 insertions(+) create mode 100644 tests/data/zip_files/foo-bar.zip create mode 100644 tests/data/zip_files/foo-bar.zip.license create mode 100644 tests/data/zip_files/foo.zip create mode 100644 tests/data/zip_files/foo.zip.license diff --git a/tests/data/zip_files/foo-bar.zip b/tests/data/zip_files/foo-bar.zip new file mode 100644 index 0000000000000000000000000000000000000000..5f7c0598608bcbeb5f92ef6fa083c84039038b06 GIT binary patch literal 247 zcmWIWW@Zs#00FPy#IOdj7;`}&8-)3QI4wUXCACODDX~beq@qMmPcJ1uC%;IcII~0{ zF*mg&0It~usCgfAMm|Ur2(yDV=Yw?lU~k@aG;5Tpa5*A}c7Y<++?D;r3H2?z^;bODIN0079*E2jVe literal 0 HcmV?d00001 diff --git a/tests/data/zip_files/foo-bar.zip.license b/tests/data/zip_files/foo-bar.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/zip_files/foo-bar.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/zip_files/foo.zip b/tests/data/zip_files/foo.zip new file mode 100644 index 0000000000000000000000000000000000000000..8e13ab608434d4387c6fe15626003c35f659f68f GIT binary patch literal 134 zcmWIWW@Zs#00Eca#ISwL8ToubHVAVAaaw-9J`n4bRFwGS7p3MZWEK>c=IZI`1$Z+u ji7?>S2-M5K$e;kCfB?yu0B=?{kPssfS^{Zv5QhN(qP!L{ literal 0 HcmV?d00001 diff --git a/tests/data/zip_files/foo.zip.license b/tests/data/zip_files/foo.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/zip_files/foo.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file From 94c29631cebbbeb890602a8a35b3d3ac21969c4e Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:28:09 +0100 Subject: [PATCH 23/50] The project is called Power Grid Model Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 2 +- src/power_grid_model_io/utils/zip_files.py | 2 +- tests/data/zip_files/foo-bar.zip.license | 2 +- tests/data/zip_files/foo.zip.license | 2 +- tests/unit/utils/test_download.py | 2 +- tests/unit/utils/test_zip_files.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 28912903..6e5db493 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 """ diff --git a/src/power_grid_model_io/utils/zip_files.py b/src/power_grid_model_io/utils/zip_files.py index 6d5c45a5..cf001a83 100644 --- a/src/power_grid_model_io/utils/zip_files.py +++ b/src/power_grid_model_io/utils/zip_files.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 """ diff --git a/tests/data/zip_files/foo-bar.zip.license b/tests/data/zip_files/foo-bar.zip.license index e6220e93..ebe16ce8 100644 --- a/tests/data/zip_files/foo-bar.zip.license +++ b/tests/data/zip_files/foo-bar.zip.license @@ -1,3 +1,3 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/zip_files/foo.zip.license b/tests/data/zip_files/foo.zip.license index e6220e93..ebe16ce8 100644 --- a/tests/data/zip_files/foo.zip.license +++ b/tests/data/zip_files/foo.zip.license @@ -1,3 +1,3 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index c4593aa7..ecca974a 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 import tempfile diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py index 0a651624..5f745e3a 100644 --- a/tests/unit/utils/test_zip_files.py +++ b/tests/unit/utils/test_zip_files.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 import shutil From 8b90603388236fbd77d86f411b7337f5a51ecfb6 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 16:10:33 +0100 Subject: [PATCH 24/50] Resolve temp dir in unit tests Signed-off-by: Bram Stoeller --- tests/unit/utils/test_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index ecca974a..a52e3f3e 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -26,7 +26,7 @@ @pytest.fixture() def temp_dir(): with tempfile.TemporaryDirectory() as tmp: - yield Path(tmp) + yield Path(tmp).resolve() def make_file(file_path: Path, file_size: int = 0): From dc73d0e2a015e09191b2583ee0b50599f6739383 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 16:18:20 +0100 Subject: [PATCH 25/50] Resolve temp dir in unit tests Signed-off-by: Bram Stoeller --- tests/unit/utils/test_download.py | 2 +- tests/unit/utils/test_zip_files.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index a52e3f3e..7e886703 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -367,7 +367,7 @@ def test_get_download_path__auto_dir(): path = get_download_path(file_name="file_name.zip") # Assert - assert path == Path(tempfile.gettempdir()).absolute() / "file_name.zip" + assert path == Path(tempfile.gettempdir()).resolve() / "file_name.zip" def test_get_download_path__auto_file_name(temp_dir: Path): diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py index 5f745e3a..7c9d7e35 100644 --- a/tests/unit/utils/test_zip_files.py +++ b/tests/unit/utils/test_zip_files.py @@ -21,7 +21,7 @@ @pytest.fixture() def temp_dir(): with tempfile.TemporaryDirectory() as tmp: - yield Path(tmp) + yield Path(tmp).resolve() @patch("power_grid_model_io.utils.download.tqdm") From e3525239f6d103ce522d715c1fd00a7efac9f444 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 12 Oct 2022 16:30:58 +0200 Subject: [PATCH 26/50] Implement a download utility Signed-off-by: Bram Stoeller --- pyproject.toml | 1 + src/power_grid_model_io/utils/download.py | 132 ++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 src/power_grid_model_io/utils/download.py diff --git a/pyproject.toml b/pyproject.toml index f6e04ad7..d348bf3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "power_grid_model>=1.4", "pyyaml", "structlog", + "tqdm", ] dynamic = ["version"] diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py new file mode 100644 index 00000000..29873e66 --- /dev/null +++ b/src/power_grid_model_io/utils/download.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +""" +Helper functions to download (and store) files from the internet +""" + +import hashlib +from pathlib import Path +from tempfile import gettempdir +from typing import ByteString, Callable, Optional +from urllib import request + +import structlog +import tqdm + + +def download( + url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False +) -> Path: + """ + Download a file from a URL and store it locally + + Args: + url: The url to the file + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will + be used. Use Path(".") if you intend to use the current working directory. + overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? + + Returns: + The path to the downloaded file + """ + log = structlog.get_logger(__name__) + file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) + + if file_path.is_file(): + if overwrite: + log.debug("Forced re-downloading existing file", url=url, file_path=file_path) + else: + current_size = get_url_size(url=url) + previous_size = file_path.stat().st_size + if previous_size == current_size: + log.debug("Skip downloading existing file", url=url, file_path=file_path) + return file_path + log.debug( + "Re-downloading existing file, because the size has changed", + url=url, + file_path=file_path, + previous_size=previous_size, + current_size=current_size, + ) + else: + log.debug("Downloading file", url=url, file_path=file_path) + + def progress_hook(progress_bar: tqdm.tqdm) -> Callable[[int, int, int], None]: + last_block = [0] + + def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None: + if file_size > 0: + progress_bar.total = file_size + progress_bar.update((block_num - last_block[0]) * block_size) + last_block[0] = block_num + + return update_progress_bar + + # Download to a temp file first, so the results are not stored if the transfer fails + with tqdm.tqdm(unit="B", unit_scale=True, desc=url, leave=True) as progress_bar: + temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) + + # Check if the file contains any content + temp_path = Path(temp_file) + if temp_path.stat().st_size == 0: + log.warning("Downloaded an empty file", url=url, file_path=file_path) + + # Move the file to it's final destination + file_path.unlink(missing_ok=True) + temp_path.rename(file_path) + log.debug("Downloaded file", url=url, file_path=file_path, file_size=file_path.stat().st_size) + + return file_path + + +def get_url_size(url: str) -> int: + """ + Retrieve the file size of a given URL (based on it's header) + + Args: + url: The url to the file + + Return: + The file size in bytes + """ + return int(request.urlopen(url).headers["Content-Length"]) + + +def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: + """ + Determine the file path based on dir_path, file_path and/or data + + Args: + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will + be used. Use Path(".") if you intend to use the current working directory. + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url + data: A bytestring that can be used to generate a filename. + """ + # If no file_path is given, generate a file name + if file_path is None: + if data is None: + raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") + try: + md5 = hashlib.md5() + md5.update(data) + hash_str = md5.hexdigest() + except (TypeError, ValueError) as ex: + raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex + file_path = Path(f"{__name__}.{hash_str}") + + # If no dir_path is given, use the system's temp dir + if dir_path is None: + dir_path = Path(gettempdir()) + + # Prefix the file_path and allow special path characters like ~ + file_path = dir_path.expanduser() / file_path + + # If the file_path exists, it should be a file (not a dir) + if file_path.exists() and not file_path.is_file(): + raise ValueError(f"Invalid file path: {file_path}") + + return file_path.absolute() From 440610fddb54b794e66073a8edadac7cc32b47ab Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 11:03:10 +0200 Subject: [PATCH 27/50] Add download_and_extract() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 135 +++++++++++++++++----- 1 file changed, 106 insertions(+), 29 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 29873e66..2278ea3a 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -6,15 +6,41 @@ """ import hashlib +import re +import shutil +import zipfile from pathlib import Path -from tempfile import gettempdir -from typing import ByteString, Callable, Optional +from typing import ByteString, Callable, Optional, Tuple from urllib import request import structlog import tqdm +def download_and_extract( + url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False +) -> Path: + """ + Download a file from a URL and store it locally, extract the contents and return the path to the contents. + + Args: + url: The url to the .zip file + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir + will be used. + overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? + + Returns: + The path to the downloaded file + """ + src_file_path = download(url=url, file_path=file_path, dir_path=dir_path, overwrite=overwrite) + dst_dir_path = src_file_path.with_suffix("") + if overwrite and dst_dir_path.is_dir(): + shutil.rmtree(dst_dir_path) + return extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=not overwrite) + + def download( url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False ) -> Path: @@ -25,34 +51,35 @@ def download( url: The url to the file file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is generated based on the url - dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will - be used. Use Path(".") if you intend to use the current working directory. + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir + will be used. overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? Returns: The path to the downloaded file """ - log = structlog.get_logger(__name__) + remote_size, remote_file_name = get_url_size_and_file_name(url=url) + if file_path is None and remote_file_name: + file_path = Path(remote_file_name) + file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) + log = structlog.get_logger(__name__).bind(url=url, file_path=file_path) if file_path.is_file(): if overwrite: - log.debug("Forced re-downloading existing file", url=url, file_path=file_path) + log.debug("Forced re-downloading existing file") else: - current_size = get_url_size(url=url) - previous_size = file_path.stat().st_size - if previous_size == current_size: - log.debug("Skip downloading existing file", url=url, file_path=file_path) + local_size = file_path.stat().st_size + if local_size == remote_size: + log.debug("Skip downloading existing file") return file_path log.debug( "Re-downloading existing file, because the size has changed", - url=url, - file_path=file_path, - previous_size=previous_size, - current_size=current_size, + local_size=local_size, + remote_size=remote_size, ) else: - log.debug("Downloading file", url=url, file_path=file_path) + log.debug("Downloading file") def progress_hook(progress_bar: tqdm.tqdm) -> Callable[[int, int, int], None]: last_block = [0] @@ -66,23 +93,24 @@ def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None return update_progress_bar # Download to a temp file first, so the results are not stored if the transfer fails - with tqdm.tqdm(unit="B", unit_scale=True, desc=url, leave=True) as progress_bar: + with tqdm.tqdm(desc=url, unit="B", unit_scale=True, leave=True) as progress_bar: temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) # Check if the file contains any content temp_path = Path(temp_file) if temp_path.stat().st_size == 0: - log.warning("Downloaded an empty file", url=url, file_path=file_path) + log.warning("Downloaded an empty file") # Move the file to it's final destination + file_path.parent.mkdir(parents=True, exist_ok=True) file_path.unlink(missing_ok=True) temp_path.rename(file_path) - log.debug("Downloaded file", url=url, file_path=file_path, file_size=file_path.stat().st_size) + log.debug("Downloaded file", file_size=file_path.stat().st_size) return file_path -def get_url_size(url: str) -> int: +def get_url_size_and_file_name(url: str) -> Tuple[int, str]: """ Retrieve the file size of a given URL (based on it's header) @@ -92,7 +120,13 @@ def get_url_size(url: str) -> int: Return: The file size in bytes """ - return int(request.urlopen(url).headers["Content-Length"]) + with request.urlopen(url) as context: + headers = context.headers + file_size = int(headers.get("Content-Length", 0)) + matches = re.findall(r"filename=\"(.+)\"", headers.get("Content-Disposition", "")) + file_name = matches[0] if matches else None + + return file_size, file_name def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: @@ -106,27 +140,70 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= generated based on the url data: A bytestring that can be used to generate a filename. """ + # If no file_path is given, generate a file name if file_path is None: - if data is None: - raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") try: md5 = hashlib.md5() md5.update(data) hash_str = md5.hexdigest() except (TypeError, ValueError) as ex: raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex - file_path = Path(f"{__name__}.{hash_str}") + file_path = Path(__name__) / f"{hash_str}.download" - # If no dir_path is given, use the system's temp dir - if dir_path is None: - dir_path = Path(gettempdir()) + # If no dir_path is given, use current working dir + elif dir_path is None: + dir_path = Path(".") - # Prefix the file_path and allow special path characters like ~ - file_path = dir_path.expanduser() / file_path + # Combine the two paths + file_path = (dir_path / file_path).resolve() if dir_path else file_path.resolve() # If the file_path exists, it should be a file (not a dir) if file_path.exists() and not file_path.is_file(): raise ValueError(f"Invalid file path: {file_path}") - return file_path.absolute() + return file_path + + +def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=True) -> Path: + """ + Extract a .zip file and return the destination dir + + Args: + src_file_path: The .zip file to extract. + src_file_path: An optional destination path. If none is given, the src_file_path wihout .zip extension is used. + skip_if_exists: If true, it returns the dir path, otherwise raise an exception. + + Returns: The path where the files are extracted + + """ + if src_file_path.suffix.lower() != ".zip": + raise ValueError(f"Only files with .zip extension are supported, got {src_file_path}") + if dst_dir_path is None: + dst_dir_path = src_file_path.with_suffix("") + + log = structlog.get_logger(__name__).bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + if dst_dir_path.exists(): + if not dst_dir_path.is_dir(): + raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") + if not skip_if_exists: + raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") + log.debug("Skip extraction, destiation dir exists") + return dst_dir_path + + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm.tqdm(desc=str(src_file_path), iterable=file_list, total=len(file_list), unit="file"): + zip_file.extract(member=file_path, path=dst_dir_path) + + # If the zip files contains a single directory with the same name as the zip file, return that dir + contents = list(dst_dir_path.iterdir()) + if len(contents) == 1 and contents[0].is_dir() and contents[0].name == src_file_path.stem: + dst_dir_path = contents[0] + + return dst_dir_path From a9ef0a9edbe7b242e5898004e36cf11b5b9e5cba Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 19:40:51 +0200 Subject: [PATCH 28/50] Fix path bug for existing dirs Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 2278ea3a..b637c0d9 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -14,7 +14,7 @@ from urllib import request import structlog -import tqdm +from tqdm import tqdm def download_and_extract( @@ -81,7 +81,7 @@ def download( else: log.debug("Downloading file") - def progress_hook(progress_bar: tqdm.tqdm) -> Callable[[int, int, int], None]: + def progress_hook(progress_bar: tqdm) -> Callable[[int, int, int], None]: last_block = [0] def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None: @@ -93,7 +93,7 @@ def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None return update_progress_bar # Download to a temp file first, so the results are not stored if the transfer fails - with tqdm.tqdm(desc=url, unit="B", unit_scale=True, leave=True) as progress_bar: + with tqdm(desc="Downloading", unit="B", unit_scale=True, leave=True) as progress_bar: temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) # Check if the file contains any content @@ -190,16 +190,16 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if not skip_if_exists: raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") log.debug("Skip extraction, destiation dir exists") - return dst_dir_path - # Create the destination directory - dst_dir_path.mkdir(parents=True, exist_ok=True) - - # Extract per file, so we can show a progress bar - with zipfile.ZipFile(src_file_path, "r") as zip_file: - file_list = zip_file.namelist() - for file_path in tqdm.tqdm(desc=str(src_file_path), iterable=file_list, total=len(file_list), unit="file"): - zip_file.extract(member=file_path, path=dst_dir_path) + else: + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + zip_file.extract(member=file_path, path=dst_dir_path) # If the zip files contains a single directory with the same name as the zip file, return that dir contents = list(dst_dir_path.iterdir()) From 32574d29b35840d3834eead0dcbc04662985fac4 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 20:04:46 +0200 Subject: [PATCH 29/50] Switch dir_path and file_path in download_and_extract Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index b637c0d9..cb115740 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -18,17 +18,17 @@ def download_and_extract( - url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False + url: str, dir_path: Optional[Path] = None, file_path: Optional[Path] = None, overwrite: bool = False ) -> Path: """ Download a file from a URL and store it locally, extract the contents and return the path to the contents. Args: url: The url to the .zip file - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is - generated based on the url dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir will be used. + file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + generated based on the url overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? Returns: From 29e1c92f5bf2ecb7d2947379c2d4ba6010d12552 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 22:05:26 +0200 Subject: [PATCH 30/50] Fix typo Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index cb115740..fa9550d7 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -189,7 +189,7 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") if not skip_if_exists: raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") - log.debug("Skip extraction, destiation dir exists") + log.debug("Skip extraction, destination dir exists") else: # Create the destination directory From 2ae07e01ac79a4a7102aab65a3e1cf45db49b6b3 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Fri, 14 Oct 2022 22:29:40 +0200 Subject: [PATCH 31/50] Get and check http status code Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index fa9550d7..e36ae70c 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -58,7 +58,10 @@ def download( Returns: The path to the downloaded file """ - remote_size, remote_file_name = get_url_size_and_file_name(url=url) + status_code, remote_size, remote_file_name = get_url_headers(url=url) + if status_code != 200: + raise IOError(f"Could not download from URL, status={status_code}") + if file_path is None and remote_file_name: file_path = Path(remote_file_name) @@ -110,7 +113,7 @@ def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None return file_path -def get_url_size_and_file_name(url: str) -> Tuple[int, str]: +def get_url_headers(url: str) -> Tuple[int, int, str]: """ Retrieve the file size of a given URL (based on it's header) @@ -121,12 +124,13 @@ def get_url_size_and_file_name(url: str) -> Tuple[int, str]: The file size in bytes """ with request.urlopen(url) as context: + status_code = context.status headers = context.headers file_size = int(headers.get("Content-Length", 0)) matches = re.findall(r"filename=\"(.+)\"", headers.get("Content-Disposition", "")) file_name = matches[0] if matches else None - return file_size, file_name + return status_code, file_size, file_name def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: From c39c565d130f37449cf58fbbb45bb62fb1f1efcb Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 26 Oct 2022 11:54:01 +0200 Subject: [PATCH 32/50] Use sha1 hashes instead of md5 Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index e36ae70c..1381e64c 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -148,9 +148,9 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= # If no file_path is given, generate a file name if file_path is None: try: - md5 = hashlib.md5() - md5.update(data) - hash_str = md5.hexdigest() + sha1 = hashlib.sha1() + sha1.update(data) + hash_str = sha1.hexdigest() except (TypeError, ValueError) as ex: raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex file_path = Path(__name__) / f"{hash_str}.download" From 01895fe14ca35a054f147739f9dc24911482647b Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 26 Oct 2022 11:57:58 +0200 Subject: [PATCH 33/50] Use sha256 hashes instead of sha1 Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 1381e64c..99ce500f 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -148,9 +148,9 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= # If no file_path is given, generate a file name if file_path is None: try: - sha1 = hashlib.sha1() - sha1.update(data) - hash_str = sha1.hexdigest() + sha256 = hashlib.sha256() + sha256.update(data) + hash_str = sha256.hexdigest() except (TypeError, ValueError) as ex: raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex file_path = Path(__name__) / f"{hash_str}.download" From e7988f1b00018d5b0c0559d5149c8144fa616fd8 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 13:55:43 +0100 Subject: [PATCH 34/50] Create DownloadProgressHook class + unit tests Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 51 ++++++++++++++------ tests/unit/utils/test_download.py | 57 +++++++++++++++++++++++ 2 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 tests/unit/utils/test_download.py diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 99ce500f..ca051221 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -10,13 +10,42 @@ import shutil import zipfile from pathlib import Path -from typing import ByteString, Callable, Optional, Tuple +from typing import ByteString, Optional, Tuple from urllib import request import structlog from tqdm import tqdm +class DownloadProgressHook: # pylint: disable=too-few-public-methods + """ + Report hook for request.urlretrieve() to update a progress bar based on the amount of downloaded blocks + """ + + def __init__(self, progress_bar: tqdm): + """ + Report hook for request.urlretrieve() to update a progress bar based on the amount of downloaded blocks + + Args: + progress_bar: A tqdb progress bar + """ + self._progress_bar = progress_bar + self._last_block = 0 + + def __call__(self, block_num: int, block_size: int, file_size: int) -> None: + """ + Args: + block_num: The last downloaded block number + block_size: The block size in bytes + file_size: The file size in bytes (may be 0 in the first call) + + """ + if file_size > 0: + self._progress_bar.total = file_size + self._progress_bar.update((block_num - self._last_block) * block_size) + self._last_block = block_num + + def download_and_extract( url: str, dir_path: Optional[Path] = None, file_path: Optional[Path] = None, overwrite: bool = False ) -> Path: @@ -30,14 +59,21 @@ def download_and_extract( file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is generated based on the url overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? + Be careful with this option, as it will remove files from your drive irreversibly! Returns: The path to the downloaded file """ + + # Download the file and use the file name as the base name for the extraction directory src_file_path = download(url=url, file_path=file_path, dir_path=dir_path, overwrite=overwrite) dst_dir_path = src_file_path.with_suffix("") + + # If we explicitly want to overwrite the extracted files, remove the if overwrite and dst_dir_path.is_dir(): shutil.rmtree(dst_dir_path) + + # Extract the files and return the path of the extraction directory return extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=not overwrite) @@ -84,20 +120,9 @@ def download( else: log.debug("Downloading file") - def progress_hook(progress_bar: tqdm) -> Callable[[int, int, int], None]: - last_block = [0] - - def update_progress_bar(block_num: int, block_size: int, file_size: int) -> None: - if file_size > 0: - progress_bar.total = file_size - progress_bar.update((block_num - last_block[0]) * block_size) - last_block[0] = block_num - - return update_progress_bar - # Download to a temp file first, so the results are not stored if the transfer fails with tqdm(desc="Downloading", unit="B", unit_scale=True, leave=True) as progress_bar: - temp_file, _headers = request.urlretrieve(url, reporthook=progress_hook(progress_bar)) + temp_file, _headers = request.urlretrieve(url, reporthook=DownloadProgressHook(progress_bar)) # Check if the file contains any content temp_path = Path(temp_file) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py new file mode 100644 index 00000000..a35bd707 --- /dev/null +++ b/tests/unit/utils/test_download.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +from unittest.mock import MagicMock + +from power_grid_model_io.utils.download import ( + DownloadProgressHook, + download, + download_and_extract, + extract, + get_download_path, + get_url_headers, +) + + +def test_progress_hook(): + # Arrange + progress_bar = MagicMock() + progress_bar.total = None + hook = DownloadProgressHook(progress_bar) + + # Act + hook(2, 10, 0) + + # Assert + assert progress_bar.total is None # total is not updated + progress_bar.update.assert_called_once_with((2 - 0) * 10) + + # Arrange + progress_bar.reset_mock() + + # Act + hook(5, 10, 123) + + # Assert + assert progress_bar.total == 123 # total is updated + progress_bar.update.assert_called_once_with((5 - 2) * 10) + + +def test_download_and_extract(): + download_and_extract + + +def test_download(): + download + + +def test_get_url_headers(): + get_url_headers + + +def test_get_download_path(): + get_download_path + + +def test_extract(): + extract From 3396563590db8497b7499b4706916c7776692b63 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 14:41:48 +0100 Subject: [PATCH 35/50] Unit tests for download_and_extract() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 4 +- tests/unit/utils/test_download.py | 90 ++++++++++++++++++++--- 2 files changed, 82 insertions(+), 12 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index ca051221..80a0d33a 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -7,9 +7,9 @@ import hashlib import re -import shutil import zipfile from pathlib import Path +from shutil import rmtree as remove_dir from typing import ByteString, Optional, Tuple from urllib import request @@ -71,7 +71,7 @@ def download_and_extract( # If we explicitly want to overwrite the extracted files, remove the if overwrite and dst_dir_path.is_dir(): - shutil.rmtree(dst_dir_path) + remove_dir(dst_dir_path) # Extract the files and return the path of the extraction directory return extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=not overwrite) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index a35bd707..f6b430a1 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,7 +1,8 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 -from unittest.mock import MagicMock +from pathlib import Path +from unittest.mock import MagicMock, patch from power_grid_model_io.utils.download import ( DownloadProgressHook, @@ -19,26 +20,95 @@ def test_progress_hook(): progress_bar.total = None hook = DownloadProgressHook(progress_bar) - # Act + # Act (block_num, block_size, file_size) hook(2, 10, 0) + assert progress_bar.total is None # total is not updated + + hook(3, 10, 123) + assert progress_bar.total == 123 # total is updated + + hook(6, 10, 123) # Assert - assert progress_bar.total is None # total is not updated - progress_bar.update.assert_called_once_with((2 - 0) * 10) + assert progress_bar.update.call_args_list[0].args == (20,) + assert progress_bar.update.call_args_list[1].args == (10,) + assert progress_bar.update.call_args_list[2].args == (30,) + +@patch("power_grid_model_io.utils.download.extract") +@patch("power_grid_model_io.utils.download.download") +def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: MagicMock): # Arrange - progress_bar.reset_mock() + url = "https://www.source.com/remote.zip" + dir_path = Path("/tmp/dir-path/") + file_path = Path("file.zip") + src_file_path = Path("/tmp/dir-path/local.zip") + dst_dir_path = Path("/tmp/dir-path/local") + extract_dir_path = Path("/tmp/extract-path/local") # in practice this would be the same as dst_dir_path + + mock_download.return_value = src_file_path + mock_extract.return_value = extract_dir_path # Act - hook(5, 10, 123) + result = download_and_extract(url=url, dir_path=dir_path, file_path=file_path) # Assert - assert progress_bar.total == 123 # total is updated - progress_bar.update.assert_called_once_with((5 - 2) * 10) + mock_download.assert_called_once_with(url=url, file_path=file_path, dir_path=dir_path, overwrite=False) + mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert result == extract_dir_path -def test_download_and_extract(): - download_and_extract +@patch("power_grid_model_io.utils.download.extract") +@patch("power_grid_model_io.utils.download.download") +def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: MagicMock): + # Arrange + url = "https://www.source.com/remote.zip" + src_file_path = Path("current-working-dir/remote.zip") + dst_dir_path = Path("current-working-dir/remote") + extract_dir_path = Path("current-working-dir/extract") # in practice this would be the same as dst_dir_path + + mock_download.return_value = src_file_path + mock_extract.return_value = extract_dir_path + + # Act + result = download_and_extract(url=url) + + # Assert + mock_download.assert_called_once_with(url=url, file_path=None, dir_path=None, overwrite=False) + mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert result == extract_dir_path + + +@patch("power_grid_model_io.utils.download.extract", new=MagicMock) +@patch("power_grid_model_io.utils.download.remove_dir") +@patch("power_grid_model_io.utils.download.Path.is_dir") +@patch("power_grid_model_io.utils.download.download") +def test_download_and_extract__overwrite(mock_download: MagicMock, mock_is_dir: MagicMock, mock_remove_dir: MagicMock): + # Arrange + src_file_path = Path("current-working-dir/remote.zip") + dst_dir_path = Path("current-working-dir/remote") + + mock_download.return_value = src_file_path + + # Act / Assert (dir does not exist, overwrite = False) + mock_is_dir.return_value = False + download_and_extract(url=MagicMock(), overwrite=False) + mock_remove_dir.assert_not_called() + + # Act / Assert (dir does not exist, overwrite = True) + mock_is_dir.return_value = False + download_and_extract(url=MagicMock(), overwrite=True) + mock_remove_dir.assert_not_called() + + # Act / Assert (dir does exist, overwrite = False) + mock_is_dir.return_value = True + download_and_extract(url=MagicMock(), overwrite=False) + mock_remove_dir.assert_not_called() + + # Act / Assert (dir does exist, overwrite = True) + mock_is_dir.return_value = True + download_and_extract(url=MagicMock(), overwrite=True) + mock_remove_dir.assert_called_once_with(dst_dir_path) def test_download(): From b2ca298a6f0ddc96a29fa9fcf565d3b9805e7a51 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 16:49:04 +0100 Subject: [PATCH 36/50] Less mocking more actual file handling (in a temp dir) Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 16 +- tests/unit/utils/test_download.py | 186 +++++++++++++++++----- 2 files changed, 159 insertions(+), 43 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 80a0d33a..42ab30b3 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -16,6 +16,8 @@ import structlog from tqdm import tqdm +_log = structlog.get_logger(__name__) + class DownloadProgressHook: # pylint: disable=too-few-public-methods """ @@ -102,11 +104,12 @@ def download( file_path = Path(remote_file_name) file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) - log = structlog.get_logger(__name__).bind(url=url, file_path=file_path) + log = _log.bind(url=url, file_path=file_path) if file_path.is_file(): if overwrite: log.debug("Forced re-downloading existing file") + # Don't remove the existing file just yet... Let's first see if we can download a new version. else: local_size = file_path.stat().st_size if local_size == remote_size: @@ -122,23 +125,26 @@ def download( # Download to a temp file first, so the results are not stored if the transfer fails with tqdm(desc="Downloading", unit="B", unit_scale=True, leave=True) as progress_bar: - temp_file, _headers = request.urlretrieve(url, reporthook=DownloadProgressHook(progress_bar)) + report_hook = DownloadProgressHook(progress_bar) + temp_file, _headers = request.urlretrieve(url, reporthook=report_hook) # Check if the file contains any content temp_path = Path(temp_file) if temp_path.stat().st_size == 0: log.warning("Downloaded an empty file") + # Remove the file, if it already exists + file_path.unlink(missing_ok=True) + # Move the file to it's final destination file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.unlink(missing_ok=True) temp_path.rename(file_path) log.debug("Downloaded file", file_size=file_path.stat().st_size) return file_path -def get_url_headers(url: str) -> Tuple[int, int, str]: +def get_url_headers(url: str) -> Tuple[int, int, Optional[str]]: """ Retrieve the file size of a given URL (based on it's header) @@ -211,7 +217,7 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if dst_dir_path is None: dst_dir_path = src_file_path.with_suffix("") - log = structlog.get_logger(__name__).bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + log = _log.bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) if dst_dir_path.exists(): if not dst_dir_path.is_dir(): diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index f6b430a1..fd73ce3e 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,8 +1,12 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 +import tempfile from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch + +import pytest +import structlog.testing from power_grid_model_io.utils.download import ( DownloadProgressHook, @@ -13,6 +17,25 @@ get_url_headers, ) +from ...utils import assert_log_exists + +ZIP_SIZE = 1024 + + +@pytest.fixture() +def temp_dir(): + with tempfile.TemporaryDirectory() as tmp: + yield Path(tmp) + + +def make_file(file_name: Path, file_size: int = 0): + with open(file_name, "wb") as fp: + fp.write(b"\0" * file_size) + + +def make_dir(dirname: Path): + dirname.mkdir(parents=True) + def test_progress_hook(): # Arrange @@ -37,14 +60,14 @@ def test_progress_hook(): @patch("power_grid_model_io.utils.download.extract") @patch("power_grid_model_io.utils.download.download") -def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: MagicMock): +def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: MagicMock, temp_dir: Path): # Arrange - url = "https://www.source.com/remote.zip" - dir_path = Path("/tmp/dir-path/") - file_path = Path("file.zip") - src_file_path = Path("/tmp/dir-path/local.zip") - dst_dir_path = Path("/tmp/dir-path/local") - extract_dir_path = Path("/tmp/extract-path/local") # in practice this would be the same as dst_dir_path + url = MagicMock() + dir_path = MagicMock() + file_path = MagicMock() + src_file_path = temp_dir / "data.zip" + dst_dir_path = temp_dir / "data" + extract_dir_path = MagicMock() mock_download.return_value = src_file_path mock_extract.return_value = extract_dir_path @@ -60,59 +83,146 @@ def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: Mag @patch("power_grid_model_io.utils.download.extract") @patch("power_grid_model_io.utils.download.download") -def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: MagicMock): +def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: MagicMock, temp_dir: Path): # Arrange - url = "https://www.source.com/remote.zip" - src_file_path = Path("current-working-dir/remote.zip") - dst_dir_path = Path("current-working-dir/remote") - extract_dir_path = Path("current-working-dir/extract") # in practice this would be the same as dst_dir_path + url = MagicMock() + src_file_path = temp_dir / "data.zip" + dst_dir_path = temp_dir / "data" mock_download.return_value = src_file_path - mock_extract.return_value = extract_dir_path # Act - result = download_and_extract(url=url) + download_and_extract(url=url) # Assert mock_download.assert_called_once_with(url=url, file_path=None, dir_path=None, overwrite=False) mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) - assert result == extract_dir_path @patch("power_grid_model_io.utils.download.extract", new=MagicMock) -@patch("power_grid_model_io.utils.download.remove_dir") -@patch("power_grid_model_io.utils.download.Path.is_dir") @patch("power_grid_model_io.utils.download.download") -def test_download_and_extract__overwrite(mock_download: MagicMock, mock_is_dir: MagicMock, mock_remove_dir: MagicMock): +def test_download_and_extract__overwrite(mock_download: MagicMock, temp_dir: Path): # Arrange - src_file_path = Path("current-working-dir/remote.zip") - dst_dir_path = Path("current-working-dir/remote") - + src_file_path = temp_dir / "data.zip" mock_download.return_value = src_file_path - # Act / Assert (dir does not exist, overwrite = False) - mock_is_dir.return_value = False - download_and_extract(url=MagicMock(), overwrite=False) - mock_remove_dir.assert_not_called() + dst_dir_path = temp_dir / "data" + make_dir(dst_dir_path) - # Act / Assert (dir does not exist, overwrite = True) - mock_is_dir.return_value = False - download_and_extract(url=MagicMock(), overwrite=True) - mock_remove_dir.assert_not_called() - - # Act / Assert (dir does exist, overwrite = False) - mock_is_dir.return_value = True + # Act / Assert download_and_extract(url=MagicMock(), overwrite=False) - mock_remove_dir.assert_not_called() + assert dst_dir_path.is_dir() # Act / Assert (dir does exist, overwrite = True) - mock_is_dir.return_value = True download_and_extract(url=MagicMock(), overwrite=True) - mock_remove_dir.assert_called_once_with(dst_dir_path) + assert not dst_dir_path.exists() + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook") +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm") +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download( + mock_url_headers: MagicMock, + mock_download_path: MagicMock, + mock_tqdm: MagicMock, + mock_urlretrieve: MagicMock, + mock_hook: MagicMock, + temp_dir: Path, +): + # Arrange + url = "https://www.source.com" + dir_path = temp_dir / "data" + file_path = temp_dir / "data.zip" + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, ZIP_SIZE) + return temp_file, None + + mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + # Act / assert + with structlog.testing.capture_logs() as capture: + result = download(url=url, file_path=file_path, dir_path=dir_path) + assert_log_exists(capture, "debug", "Downloading file") + + # Assert + mock_download_path.assert_called_once_with(dir_path=dir_path, file_path=file_path, data=b"https://www.source.com") + mock_tqdm.assert_called_once() + mock_hook.assert_called_once_with(mock_tqdm.return_value.__enter__.return_value) + mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) + assert result == file_path + assert result.is_file() + assert result.stat().st_size == ZIP_SIZE + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__auto_file_name( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, 0) + return temp_file, None + + mock_url_headers.return_value = (200, 0, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act + download(url=MagicMock()) + + # Assert + mock_download_path.assert_called_once_with(dir_path=None, file_path=Path("remote.zip"), data=ANY) + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__empty_file( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, 0) + return temp_file, None + + mock_url_headers.return_value = (200, 0, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act / Assert + with structlog.testing.capture_logs() as capture: + download(url=MagicMock()) + assert_log_exists(capture, "debug", "Downloading file") + assert_log_exists(capture, "warning", "Downloaded an empty file") + + +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__status_error(mock_url_headers: MagicMock): + # Arrange + mock_url_headers.return_value = (403, 0, None) -def test_download(): - download + # Act / Assert + with pytest.raises(IOError, match=r"Could not download from URL, status=403"): + download(url=MagicMock()) def test_get_url_headers(): From 045d887a45e9e3f0ce3bab8c5c23ac8eeda5d9b6 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Thu, 23 Feb 2023 16:59:11 +0100 Subject: [PATCH 37/50] Unit tests for download() Signed-off-by: Bram Stoeller --- tests/unit/utils/test_download.py | 88 +++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index fd73ce3e..bf5de783 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -215,6 +215,94 @@ def urlretrieve(*_args, **_kwargs): assert_log_exists(capture, "warning", "Downloaded an empty file") +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__skip_existing_file( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + download_path = temp_dir / "data.zip" + make_file(download_path, ZIP_SIZE) + + mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_download_path.return_value = download_path + + # Act + with structlog.testing.capture_logs() as capture: + download(url=MagicMock()) + assert_log_exists(capture, "debug", "Skip downloading existing file") + + # Assert + mock_urlretrieve.assert_not_called() + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__update_file( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + make_file(download_path, ZIP_SIZE) + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, ZIP_SIZE + 1) + return temp_file, None + + mock_url_headers.return_value = (200, ZIP_SIZE + 1, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act / Assert + with structlog.testing.capture_logs() as capture: + result = download(url=MagicMock()) + assert_log_exists(capture, "debug", "Re-downloading existing file, because the size has changed") + + # Assert + assert result == download_path + assert result.is_file() + assert result.stat().st_size == ZIP_SIZE + 1 + + +@patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) +@patch("power_grid_model_io.utils.download.request.urlretrieve") +@patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) +@patch("power_grid_model_io.utils.download.get_download_path") +@patch("power_grid_model_io.utils.download.get_url_headers") +def test_download__overwrite( + mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path +): + # Arrange + temp_file = temp_dir / "data.download" + download_path = temp_dir / "data.zip" + make_file(download_path, ZIP_SIZE) + + def urlretrieve(*_args, **_kwargs): + make_file(temp_file, ZIP_SIZE) + return temp_file, None + + mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_download_path.return_value = download_path + mock_urlretrieve.side_effect = urlretrieve + + # Act / Assert + with structlog.testing.capture_logs() as capture: + result = download(url=MagicMock(), overwrite=True) + assert_log_exists(capture, "debug", "Forced re-downloading existing file") + + # Assert + assert result == download_path + assert result.is_file() + assert result.stat().st_size == ZIP_SIZE + + @patch("power_grid_model_io.utils.download.get_url_headers") def test_download__status_error(mock_url_headers: MagicMock): # Arrange From 939e8db0cdf86fbd0820765edff282eca13c57aa Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Tue, 28 Feb 2023 16:19:11 +0100 Subject: [PATCH 38/50] get_response_info unit tests Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 36 ++++++--- tests/unit/utils/test_download.py | 97 +++++++++++++++++------ 2 files changed, 95 insertions(+), 38 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 42ab30b3..1f683fae 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -8,9 +8,10 @@ import hashlib import re import zipfile +from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir -from typing import ByteString, Optional, Tuple +from typing import ByteString, Optional from urllib import request import structlog @@ -19,6 +20,17 @@ _log = structlog.get_logger(__name__) +@dataclass +class ResponseInfo: + """ + Struct to store response information extracted from the response header + """ + + status: int + file_name: Optional[str] = None + file_size: Optional[int] = None + + class DownloadProgressHook: # pylint: disable=too-few-public-methods """ Report hook for request.urlretrieve() to update a progress bar based on the amount of downloaded blocks @@ -96,12 +108,12 @@ def download( Returns: The path to the downloaded file """ - status_code, remote_size, remote_file_name = get_url_headers(url=url) - if status_code != 200: - raise IOError(f"Could not download from URL, status={status_code}") + info = get_response_info(url=url) + if info.status != 200: + raise IOError(f"Could not download from URL, status={info.status}") - if file_path is None and remote_file_name: - file_path = Path(remote_file_name) + if file_path is None and info.file_name: + file_path = Path(info.file_name) file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) log = _log.bind(url=url, file_path=file_path) @@ -112,13 +124,13 @@ def download( # Don't remove the existing file just yet... Let's first see if we can download a new version. else: local_size = file_path.stat().st_size - if local_size == remote_size: + if local_size == info.file_size: log.debug("Skip downloading existing file") return file_path log.debug( "Re-downloading existing file, because the size has changed", local_size=local_size, - remote_size=remote_size, + remote_size=info.file_size, ) else: log.debug("Downloading file") @@ -144,7 +156,7 @@ def download( return file_path -def get_url_headers(url: str) -> Tuple[int, int, Optional[str]]: +def get_response_info(url: str) -> ResponseInfo: """ Retrieve the file size of a given URL (based on it's header) @@ -155,13 +167,13 @@ def get_url_headers(url: str) -> Tuple[int, int, Optional[str]]: The file size in bytes """ with request.urlopen(url) as context: - status_code = context.status + status = context.status headers = context.headers - file_size = int(headers.get("Content-Length", 0)) + file_size = int(headers["Content-Length"]) if "Content-Length" in headers else None matches = re.findall(r"filename=\"(.+)\"", headers.get("Content-Disposition", "")) file_name = matches[0] if matches else None - return status_code, file_size, file_name + return ResponseInfo(status=status, file_size=file_size, file_name=file_name) def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index bf5de783..1bdc2072 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: MPL-2.0 import tempfile +from collections import namedtuple from pathlib import Path from unittest.mock import ANY, MagicMock, patch @@ -10,16 +11,19 @@ from power_grid_model_io.utils.download import ( DownloadProgressHook, + ResponseInfo, download, download_and_extract, extract, get_download_path, - get_url_headers, + get_response_info, ) from ...utils import assert_log_exists -ZIP_SIZE = 1024 +ZIP_SIZE = 1024 # 1kb + +Response = namedtuple("Response", ["status", "headers"]) @pytest.fixture() @@ -122,9 +126,9 @@ def test_download_and_extract__overwrite(mock_download: MagicMock, temp_dir: Pat @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm") @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download( - mock_url_headers: MagicMock, + mock_info: MagicMock, mock_download_path: MagicMock, mock_tqdm: MagicMock, mock_urlretrieve: MagicMock, @@ -142,7 +146,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -165,9 +169,9 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__auto_file_name( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -177,7 +181,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_url_headers.return_value = (200, 0, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -192,9 +196,9 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__empty_file( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -204,7 +208,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_url_headers.return_value = (200, 0, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -219,15 +223,15 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__skip_existing_file( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange download_path = temp_dir / "data.zip" make_file(download_path, ZIP_SIZE) - mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path # Act @@ -243,9 +247,9 @@ def test_download__skip_existing_file( @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__update_file( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -256,7 +260,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE + 1) return temp_file, None - mock_url_headers.return_value = (200, ZIP_SIZE + 1, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -275,9 +279,9 @@ def urlretrieve(*_args, **_kwargs): @patch("power_grid_model_io.utils.download.request.urlretrieve") @patch("power_grid_model_io.utils.download.tqdm", new=MagicMock()) @patch("power_grid_model_io.utils.download.get_download_path") -@patch("power_grid_model_io.utils.download.get_url_headers") +@patch("power_grid_model_io.utils.download.get_response_info") def test_download__overwrite( - mock_url_headers: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path + mock_info: MagicMock, mock_download_path: MagicMock, mock_urlretrieve: MagicMock, temp_dir: Path ): # Arrange temp_file = temp_dir / "data.download" @@ -288,7 +292,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_url_headers.return_value = (200, ZIP_SIZE, "remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -303,18 +307,59 @@ def urlretrieve(*_args, **_kwargs): assert result.stat().st_size == ZIP_SIZE -@patch("power_grid_model_io.utils.download.get_url_headers") -def test_download__status_error(mock_url_headers: MagicMock): +@patch("power_grid_model_io.utils.download.get_response_info") +def test_download__status_error(mock_info: MagicMock): # Arrange - mock_url_headers.return_value = (403, 0, None) + mock_info.return_value = ResponseInfo(status=404, file_size=None, file_name=None) # Act / Assert - with pytest.raises(IOError, match=r"Could not download from URL, status=403"): + with pytest.raises(IOError, match=r"Could not download from URL, status=404"): download(url=MagicMock()) -def test_get_url_headers(): - get_url_headers +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info(mock_urlopen): + + # Arrange + headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} + mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=456, file_name="filename.zip") + + +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info__no_file_name(mock_urlopen): + + # Arrange + headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"'} + mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=456, file_name=None) + + +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info__no_disposition(mock_urlopen): + + # Arrange + headers = {"Content-Length": "456"} + Context = namedtuple("Context", ["status", "headers"]) + mock_urlopen.return_value.__enter__.return_value = Context(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=456, file_name=None) + + +@patch("power_grid_model_io.utils.download.request.urlopen") +def test_get_response_info__no_length(mock_urlopen): + + # Arrange + headers = {"Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} + mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) + + # Act / Assert + assert get_response_info("") == ResponseInfo(status=123, file_size=None, file_name="filename.zip") def test_get_download_path(): From e8327a5f652ec6658c15467bc0401b51434083d9 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Tue, 28 Feb 2023 17:03:25 +0100 Subject: [PATCH 39/50] Unit tests for get_download_path() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 15 ++++++-- tests/unit/utils/test_download.py | 45 ++++++++++++++++++++--- 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 1f683fae..49107b27 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir -from typing import ByteString, Optional +from typing import Optional from urllib import request import structlog @@ -176,7 +176,9 @@ def get_response_info(url: str) -> ResponseInfo: return ResponseInfo(status=status, file_size=file_size, file_name=file_name) -def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data=Optional[ByteString]) -> Path: +def get_download_path( + dir_path: Optional[Path] = None, file_path: Optional[Path] = None, data: Optional[bytes] = None +) -> Path: """ Determine the file path based on dir_path, file_path and/or data @@ -190,19 +192,24 @@ def get_download_path(dir_path: Optional[Path], file_path: Optional[Path], data= # If no file_path is given, generate a file name if file_path is None: + if data is None: + raise ValueError("Supply data in order to auto generate a download path.") try: sha256 = hashlib.sha256() sha256.update(data) hash_str = sha256.hexdigest() except (TypeError, ValueError) as ex: - raise ValueError(f"Can't auto generate a file name for a {type(data).__name__}.") from ex - file_path = Path(__name__) / f"{hash_str}.download" + raise ValueError( + f"Can't auto generate a download path for '{type(data).__name__}' data (convert it to bytes first)." + ) from ex + file_path = Path(f"{hash_str}.download") # If no dir_path is given, use current working dir elif dir_path is None: dir_path = Path(".") # Combine the two paths + assert file_path is not None file_path = (dir_path / file_path).resolve() if dir_path else file_path.resolve() # If the file_path exists, it should be a file (not a dir) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 1bdc2072..84028290 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -319,7 +319,6 @@ def test_download__status_error(mock_info: MagicMock): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info(mock_urlopen): - # Arrange headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) @@ -330,7 +329,6 @@ def test_get_response_info(mock_urlopen): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info__no_file_name(mock_urlopen): - # Arrange headers = {"Content-Length": "456", "Content-Disposition": 'form-data; name="ZipFile"'} mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) @@ -341,7 +339,6 @@ def test_get_response_info__no_file_name(mock_urlopen): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info__no_disposition(mock_urlopen): - # Arrange headers = {"Content-Length": "456"} Context = namedtuple("Context", ["status", "headers"]) @@ -353,7 +350,6 @@ def test_get_response_info__no_disposition(mock_urlopen): @patch("power_grid_model_io.utils.download.request.urlopen") def test_get_response_info__no_length(mock_urlopen): - # Arrange headers = {"Content-Disposition": 'form-data; name="ZipFile"; filename="filename.zip"'} mock_urlopen.return_value.__enter__.return_value = Response(status=123, headers=headers) @@ -362,8 +358,45 @@ def test_get_response_info__no_length(mock_urlopen): assert get_response_info("") == ResponseInfo(status=123, file_size=None, file_name="filename.zip") -def test_get_download_path(): - get_download_path +def test_get_download_path(temp_dir): + # Act + path = get_download_path(dir_path=temp_dir, file_path=Path("file_name.zip"), data=b"foo") + + # Assert + assert path == temp_dir / "file_name.zip" + + +def test_get_download_path__auto_dir(): + # Act + path = get_download_path(file_path=Path("file_name.zip")) + + # Assert + assert path == Path(".").absolute() / "file_name.zip" + + +def test_get_download_path__auto_file_name(temp_dir): + # Arrange + foo_sha256_hash = "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae" + + # Act + path = get_download_path(dir_path=temp_dir, data=b"foo") + + # Assert + assert path == temp_dir / f"{foo_sha256_hash}.download" + + +def test_get_download_path__invalid_data_type(temp_dir): + # Act / Assert + with pytest.raises( + ValueError, match=r"Can't auto generate a download path for 'str' data \(convert it to bytes first\)\." + ): + get_download_path(data="foo") # type: ignore + + +def test_get_download_path__missing_data(temp_dir): + # Act / Assert + with pytest.raises(ValueError, match=r"Supply data in order to auto generate a download path\."): + get_download_path(dir_path=temp_dir) def test_extract(): From bebfe6cdaa7f0956fc4530413a3573e8d9296856 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 10:20:16 +0100 Subject: [PATCH 40/50] Refactoring Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 83 ++++++++++++----------- tests/unit/utils/test_download.py | 56 ++++++++------- 2 files changed, 75 insertions(+), 64 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 49107b27..76ef25f4 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -5,13 +5,15 @@ Helper functions to download (and store) files from the internet """ +import base64 import hashlib import re +import tempfile import zipfile from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir -from typing import Optional +from typing import Optional, Union from urllib import request import structlog @@ -61,7 +63,7 @@ def __call__(self, block_num: int, block_size: int, file_size: int) -> None: def download_and_extract( - url: str, dir_path: Optional[Path] = None, file_path: Optional[Path] = None, overwrite: bool = False + url: str, dir_path: Optional[Path] = None, file_name: Optional[Union[str, Path]] = None, overwrite: bool = False ) -> Path: """ Download a file from a URL and store it locally, extract the contents and return the path to the contents. @@ -70,7 +72,7 @@ def download_and_extract( url: The url to the .zip file dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir will be used. - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + file_name: An optional file name (or path relative to dir_path). If no file_name is given, a file name is generated based on the url overwrite: Should we download the file, even if we have downloaded already (and the file size still matches)? Be careful with this option, as it will remove files from your drive irreversibly! @@ -80,7 +82,7 @@ def download_and_extract( """ # Download the file and use the file name as the base name for the extraction directory - src_file_path = download(url=url, file_path=file_path, dir_path=dir_path, overwrite=overwrite) + src_file_path = download(url=url, file_name=file_name, dir_path=dir_path, overwrite=overwrite) dst_dir_path = src_file_path.with_suffix("") # If we explicitly want to overwrite the extracted files, remove the @@ -92,14 +94,14 @@ def download_and_extract( def download( - url: str, file_path: Optional[Path] = None, dir_path: Optional[Path] = None, overwrite: bool = False + url: str, file_name: Optional[Union[str, Path]] = None, dir_path: Optional[Path] = None, overwrite: bool = False ) -> Path: """ Download a file from a URL and store it locally Args: url: The url to the file - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is + file_name: An optional file name (or path relative to dir_path). If no file_name is given, a file name is generated based on the url dir_path: An optional dir path to store the downloaded file. If no dir_path is given the current working dir will be used. @@ -108,14 +110,16 @@ def download( Returns: The path to the downloaded file """ + + # get the response info, if the status is not 200 info = get_response_info(url=url) if info.status != 200: raise IOError(f"Could not download from URL, status={info.status}") - if file_path is None and info.file_name: - file_path = Path(info.file_name) + if file_name is None and info.file_name: + file_name = info.file_name - file_path = get_download_path(dir_path=dir_path, file_path=file_path, data=url.encode()) + file_path = get_download_path(dir_path=dir_path, file_name=file_name, unique_key=url) log = _log.bind(url=url, file_path=file_path) if file_path.is_file(): @@ -177,40 +181,39 @@ def get_response_info(url: str) -> ResponseInfo: def get_download_path( - dir_path: Optional[Path] = None, file_path: Optional[Path] = None, data: Optional[bytes] = None + dir_path: Optional[Path] = None, + file_name: Optional[Union[str, Path]] = None, + unique_key: Optional[str] = None, ) -> Path: """ - Determine the file path based on dir_path, file_path and/or data + Determine the file path based on dir_path, file_name and/or data Args: - dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir will - be used. Use Path(".") if you intend to use the current working directory. - file_path: An optional file path (absolute or relative to dir_path). If no file_path is given, a file name is - generated based on the url - data: A bytestring that can be used to generate a filename. + dir_path: An optional dir path to store the downloaded file. If no dir_path is given the system's temp dir + will be used. If omitted, the tempfolder is used. + file_name: An optional file name (or path relative to dir_path). If no file_name is given, a file name is + generated based on the unique key (e.g. an url) + unique_key: A unique string that can be used to generate a filename (e.g. a url). """ - # If no file_path is given, generate a file name - if file_path is None: - if data is None: + # If no file_name is given, generate a file name + if file_name is None: + if unique_key is None: raise ValueError("Supply data in order to auto generate a download path.") - try: - sha256 = hashlib.sha256() - sha256.update(data) - hash_str = sha256.hexdigest() - except (TypeError, ValueError) as ex: - raise ValueError( - f"Can't auto generate a download path for '{type(data).__name__}' data (convert it to bytes first)." - ) from ex - file_path = Path(f"{hash_str}.download") - - # If no dir_path is given, use current working dir + + sha256 = hashlib.sha256() + sha256.update(unique_key.encode()) + hash_str = base64.b64encode(sha256.digest()).decode("ascii") + hash_str = hash_str.replace("/", "_").replace("+", "-").rstrip("=") + file_name = Path(f"{hash_str}.download") + + # If no dir_path is given, use the system's designated folder for temporary files elif dir_path is None: - dir_path = Path(".") + dir_path = Path(tempfile.gettempdir()) # Combine the two paths - assert file_path is not None - file_path = (dir_path / file_path).resolve() if dir_path else file_path.resolve() + assert file_name is not None + file_path = (dir_path / file_name).resolve() if dir_path else Path(file_name).resolve() # If the file_path exists, it should be a file (not a dir) if file_path.exists() and not file_path.is_file(): @@ -225,14 +228,15 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex Args: src_file_path: The .zip file to extract. - src_file_path: An optional destination path. If none is given, the src_file_path wihout .zip extension is used. - skip_if_exists: If true, it returns the dir path, otherwise raise an exception. + dst_dir_path: An optional destination path. If none is given, the src_file_path without .zip extension is used. + skip_if_exists: Skip existing files, otherwise raise an exception when a file exists. Returns: The path where the files are extracted """ if src_file_path.suffix.lower() != ".zip": raise ValueError(f"Only files with .zip extension are supported, got {src_file_path}") + if dst_dir_path is None: dst_dir_path = src_file_path.with_suffix("") @@ -241,9 +245,6 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if dst_dir_path.exists(): if not dst_dir_path.is_dir(): raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") - if not skip_if_exists: - raise FileExistsError(f"Destination dir {dst_dir_path} exists and is not empty") - log.debug("Skip extraction, destination dir exists") else: # Create the destination directory @@ -253,6 +254,12 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex with zipfile.ZipFile(src_file_path, "r") as zip_file: file_list = zip_file.namelist() for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + dst_file_path = dst_dir_path / file_path + if dst_file_path.exists() and dst_file_path.stat().st_size > 0: + if skip_if_exists: + log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) + continue + raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") zip_file.extract(member=file_path, path=dst_dir_path) # If the zip files contains a single directory with the same name as the zip file, return that dir diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 84028290..112cb8be 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -77,10 +77,10 @@ def test_download_and_extract__paths(mock_download: MagicMock, mock_extract: Mag mock_extract.return_value = extract_dir_path # Act - result = download_and_extract(url=url, dir_path=dir_path, file_path=file_path) + result = download_and_extract(url=url, dir_path=dir_path, file_name=file_path) # Assert - mock_download.assert_called_once_with(url=url, file_path=file_path, dir_path=dir_path, overwrite=False) + mock_download.assert_called_once_with(url=url, file_name=file_path, dir_path=dir_path, overwrite=False) mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) assert result == extract_dir_path @@ -99,7 +99,7 @@ def test_download_and_extract__no_paths(mock_download: MagicMock, mock_extract: download_and_extract(url=url) # Assert - mock_download.assert_called_once_with(url=url, file_path=None, dir_path=None, overwrite=False) + mock_download.assert_called_once_with(url=url, file_name=None, dir_path=None, overwrite=False) mock_extract.assert_called_once_with(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) @@ -146,17 +146,19 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote1.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve # Act / assert with structlog.testing.capture_logs() as capture: - result = download(url=url, file_path=file_path, dir_path=dir_path) + result = download(url=url, file_name=file_path, dir_path=dir_path) assert_log_exists(capture, "debug", "Downloading file") # Assert - mock_download_path.assert_called_once_with(dir_path=dir_path, file_path=file_path, data=b"https://www.source.com") + mock_download_path.assert_called_once_with( + dir_path=dir_path, file_name=file_path, unique_key="https://www.source.com" + ) mock_tqdm.assert_called_once() mock_hook.assert_called_once_with(mock_tqdm.return_value.__enter__.return_value) mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) @@ -181,7 +183,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote2.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -189,7 +191,7 @@ def urlretrieve(*_args, **_kwargs): download(url=MagicMock()) # Assert - mock_download_path.assert_called_once_with(dir_path=None, file_path=Path("remote.zip"), data=ANY) + mock_download_path.assert_called_once_with(dir_path=None, file_name="remote2.zip", unique_key=ANY) @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -208,7 +210,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, 0) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote3.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -231,7 +233,7 @@ def test_download__skip_existing_file( download_path = temp_dir / "data.zip" make_file(download_path, ZIP_SIZE) - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote4.zip") mock_download_path.return_value = download_path # Act @@ -260,7 +262,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE + 1) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote5.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -292,7 +294,7 @@ def urlretrieve(*_args, **_kwargs): make_file(temp_file, ZIP_SIZE) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote6.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -360,7 +362,7 @@ def test_get_response_info__no_length(mock_urlopen): def test_get_download_path(temp_dir): # Act - path = get_download_path(dir_path=temp_dir, file_path=Path("file_name.zip"), data=b"foo") + path = get_download_path(dir_path=temp_dir, file_name="file_name.zip", unique_key="foo") # Assert assert path == temp_dir / "file_name.zip" @@ -368,29 +370,23 @@ def test_get_download_path(temp_dir): def test_get_download_path__auto_dir(): # Act - path = get_download_path(file_path=Path("file_name.zip")) + path = get_download_path(file_name="file_name.zip") # Assert - assert path == Path(".").absolute() / "file_name.zip" + assert path == Path(tempfile.gettempdir()).absolute() / "file_name.zip" def test_get_download_path__auto_file_name(temp_dir): # Arrange - foo_sha256_hash = "2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae" + # The base64 representation of the sha256 hash of "foo" is LCa0a2j/xo/5m0U8HTBBNBNCLXBkg7+g+YpeiGJm564= + # The / and + will be replaced with a _ and - character and the trailing = character(s) will be removed. + expected_file_name = "LCa0a2j_xo_5m0U8HTBBNBNCLXBkg7-g-YpeiGJm564.download" # Act - path = get_download_path(dir_path=temp_dir, data=b"foo") + path = get_download_path(dir_path=temp_dir, unique_key="foo") # Assert - assert path == temp_dir / f"{foo_sha256_hash}.download" - - -def test_get_download_path__invalid_data_type(temp_dir): - # Act / Assert - with pytest.raises( - ValueError, match=r"Can't auto generate a download path for 'str' data \(convert it to bytes first\)\." - ): - get_download_path(data="foo") # type: ignore + assert path == temp_dir / expected_file_name def test_get_download_path__missing_data(temp_dir): @@ -399,5 +395,13 @@ def test_get_download_path__missing_data(temp_dir): get_download_path(dir_path=temp_dir) +def test_get_download_path__invalid_file_path(temp_dir): + # Arrange + make_dir(temp_dir / "download") + # Act / Assert + with pytest.raises(ValueError, match=r"Invalid file path:"): + get_download_path(dir_path=temp_dir, file_name="download") + + def test_extract(): extract From ae0d282c87913d79178d3271e84d40dacc4d606e Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 11:19:11 +0100 Subject: [PATCH 41/50] Add two very small zip files for unit testing Signed-off-by: Bram Stoeller --- tests/data/downloader/one-file.zip | Bin 0 -> 126 bytes tests/data/downloader/one-file.zip.license | 3 + tests/data/downloader/two-files.zip | Bin 0 -> 247 bytes tests/data/downloader/two-files.zip.license | 3 + tests/unit/utils/test_download.py | 61 ++++++++++---------- 5 files changed, 37 insertions(+), 30 deletions(-) create mode 100644 tests/data/downloader/one-file.zip create mode 100644 tests/data/downloader/one-file.zip.license create mode 100644 tests/data/downloader/two-files.zip create mode 100644 tests/data/downloader/two-files.zip.license diff --git a/tests/data/downloader/one-file.zip b/tests/data/downloader/one-file.zip new file mode 100644 index 0000000000000000000000000000000000000000..1c1130a3a8542f079d908eb6f6ece0c30122fe52 GIT binary patch literal 126 zcmWIWW@Zs#00Eca#ISwL8ToubHVCr=aaw-9UP(oXPkvEqu0m!(acQoeo?d`ABa;XN gZcRXa42%p4APNYeMg(}XvVpjaKxhi2jX@j+0Fbs5I{*Lx literal 0 HcmV?d00001 diff --git a/tests/data/downloader/one-file.zip.license b/tests/data/downloader/one-file.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/downloader/one-file.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/downloader/two-files.zip b/tests/data/downloader/two-files.zip new file mode 100644 index 0000000000000000000000000000000000000000..5f7c0598608bcbeb5f92ef6fa083c84039038b06 GIT binary patch literal 247 zcmWIWW@Zs#00FPy#IOdj7;`}&8-)3QI4wUXCACODDX~beq@qMmPcJ1uC%;IcII~0{ zF*mg&0It~usCgfAMm|Ur2(yDV=Yw?lU~k@aG;5Tpa5*A}c7Y<++?D;r3H2?z^;bODIN0079*E2jVe literal 0 HcmV?d00001 diff --git a/tests/data/downloader/two-files.zip.license b/tests/data/downloader/two-files.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/downloader/two-files.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 112cb8be..99409e5d 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 +import shutil import tempfile from collections import namedtuple from pathlib import Path @@ -21,10 +22,14 @@ from ...utils import assert_log_exists -ZIP_SIZE = 1024 # 1kb - Response = namedtuple("Response", ["status", "headers"]) +DATA_DIR = Path(__file__).parents[2] / "data" / "downloader" +ZIP1 = DATA_DIR / "one-file.zip" +ZIP2 = DATA_DIR / "two-files.zip" +ZIP1_SIZE = ZIP1.stat().st_size +ZIP2_SIZE = ZIP2.stat().st_size + @pytest.fixture() def temp_dir(): @@ -32,11 +37,6 @@ def temp_dir(): yield Path(tmp) -def make_file(file_name: Path, file_size: int = 0): - with open(file_name, "wb") as fp: - fp.write(b"\0" * file_size) - - def make_dir(dirname: Path): dirname.mkdir(parents=True) @@ -143,10 +143,10 @@ def test_download( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - make_file(temp_file, ZIP_SIZE) + shutil.copyfile(ZIP1, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote1.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -164,7 +164,7 @@ def urlretrieve(*_args, **_kwargs): mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) assert result == file_path assert result.is_file() - assert result.stat().st_size == ZIP_SIZE + assert result.stat().st_size == ZIP1_SIZE @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -180,10 +180,10 @@ def test_download__auto_file_name( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - make_file(temp_file, 0) + shutil.copyfile(ZIP1, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote2.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -191,7 +191,7 @@ def urlretrieve(*_args, **_kwargs): download(url=MagicMock()) # Assert - mock_download_path.assert_called_once_with(dir_path=None, file_name="remote2.zip", unique_key=ANY) + mock_download_path.assert_called_once_with(dir_path=None, file_name="remote.zip", unique_key=ANY) @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -207,10 +207,11 @@ def test_download__empty_file( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - make_file(temp_file, 0) + with open(temp_file, "wb"): + pass return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote3.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -231,9 +232,9 @@ def test_download__skip_existing_file( ): # Arrange download_path = temp_dir / "data.zip" - make_file(download_path, ZIP_SIZE) + shutil.copyfile(ZIP1, download_path) - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote4.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path # Act @@ -256,13 +257,13 @@ def test_download__update_file( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - make_file(download_path, ZIP_SIZE) + shutil.copyfile(ZIP1, download_path) def urlretrieve(*_args, **_kwargs): - make_file(temp_file, ZIP_SIZE + 1) + shutil.copyfile(ZIP2, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE + 1, file_name="remote5.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP2_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -274,7 +275,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP_SIZE + 1 + assert result.stat().st_size == ZIP2_SIZE @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -288,13 +289,13 @@ def test_download__overwrite( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - make_file(download_path, ZIP_SIZE) + shutil.copyfile(ZIP1, download_path) def urlretrieve(*_args, **_kwargs): - make_file(temp_file, ZIP_SIZE) + shutil.copyfile(ZIP1, temp_file) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP_SIZE, file_name="remote6.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -306,7 +307,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP_SIZE + assert result.stat().st_size == ZIP1_SIZE @patch("power_grid_model_io.utils.download.get_response_info") @@ -360,7 +361,7 @@ def test_get_response_info__no_length(mock_urlopen): assert get_response_info("") == ResponseInfo(status=123, file_size=None, file_name="filename.zip") -def test_get_download_path(temp_dir): +def test_get_download_path(temp_dir: Path): # Act path = get_download_path(dir_path=temp_dir, file_name="file_name.zip", unique_key="foo") @@ -376,7 +377,7 @@ def test_get_download_path__auto_dir(): assert path == Path(tempfile.gettempdir()).absolute() / "file_name.zip" -def test_get_download_path__auto_file_name(temp_dir): +def test_get_download_path__auto_file_name(temp_dir: Path): # Arrange # The base64 representation of the sha256 hash of "foo" is LCa0a2j/xo/5m0U8HTBBNBNCLXBkg7+g+YpeiGJm564= # The / and + will be replaced with a _ and - character and the trailing = character(s) will be removed. @@ -389,13 +390,13 @@ def test_get_download_path__auto_file_name(temp_dir): assert path == temp_dir / expected_file_name -def test_get_download_path__missing_data(temp_dir): +def test_get_download_path__missing_data(temp_dir: Path): # Act / Assert with pytest.raises(ValueError, match=r"Supply data in order to auto generate a download path\."): get_download_path(dir_path=temp_dir) -def test_get_download_path__invalid_file_path(temp_dir): +def test_get_download_path__invalid_file_path(temp_dir: Path): # Arrange make_dir(temp_dir / "download") # Act / Assert @@ -403,5 +404,5 @@ def test_get_download_path__invalid_file_path(temp_dir): get_download_path(dir_path=temp_dir, file_name="download") -def test_extract(): +def test_extract(temp_dir: Path): extract From 3a7f99927cb258b53386190a454397ed502f7fad Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 12:06:44 +0100 Subject: [PATCH 42/50] Rename example zip files Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 42 +++++++---- .../downloader/{two-files.zip => foo-bar.zip} | Bin ...e-file.zip.license => foo-bar.zip.license} | 0 tests/data/downloader/foo.zip | Bin 0 -> 134 bytes ...{two-files.zip.license => foo.zip.license} | 0 tests/data/downloader/one-file.zip | Bin 126 -> 0 bytes tests/unit/utils/test_download.py | 70 +++++++++++++++--- tests/utils.py | 13 ++++ 8 files changed, 97 insertions(+), 28 deletions(-) rename tests/data/downloader/{two-files.zip => foo-bar.zip} (100%) rename tests/data/downloader/{one-file.zip.license => foo-bar.zip.license} (100%) create mode 100644 tests/data/downloader/foo.zip rename tests/data/downloader/{two-files.zip.license => foo.zip.license} (100%) delete mode 100644 tests/data/downloader/one-file.zip diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 76ef25f4..40c6db58 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -3,6 +3,17 @@ # SPDX-License-Identifier: MPL-2.0 """ Helper functions to download (and store) files from the internet + +The most simple (and intended) usage is: +url = "http://141.51.193.167/simbench/gui/usecase/download/?simbench_code=1-complete_data-mixed-all-0-sw&format=csv" +csv_dir = download_and_extract(url) + +It will download the zip file 1-complete_data-mixed-all-0-sw.zip to a folder in you systems temp dir; for example +"/tmp/1-complete_data-mixed-all-0-sw.zip". Then it extracts the files there as well, in a folder that corresponds to +the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in our example), and it returns the path to that directory. +By default, it will not re-download or re-extract the zip file as long as the files exist in you temp dir. Your temp +dir is typically emptied whe you reboot your computer. + """ import base64 @@ -222,7 +233,7 @@ def get_download_path( return file_path -def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=True) -> Path: +def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: """ Extract a .zip file and return the destination dir @@ -246,21 +257,20 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex if not dst_dir_path.is_dir(): raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") - else: - # Create the destination directory - dst_dir_path.mkdir(parents=True, exist_ok=True) - - # Extract per file, so we can show a progress bar - with zipfile.ZipFile(src_file_path, "r") as zip_file: - file_list = zip_file.namelist() - for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): - dst_file_path = dst_dir_path / file_path - if dst_file_path.exists() and dst_file_path.stat().st_size > 0: - if skip_if_exists: - log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) - continue - raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") - zip_file.extract(member=file_path, path=dst_dir_path) + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + dst_file_path = dst_dir_path / file_path + if dst_file_path.exists() and dst_file_path.stat().st_size > 0: + if skip_if_exists: + log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) + continue + raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") + zip_file.extract(member=file_path, path=dst_dir_path) # If the zip files contains a single directory with the same name as the zip file, return that dir contents = list(dst_dir_path.iterdir()) diff --git a/tests/data/downloader/two-files.zip b/tests/data/downloader/foo-bar.zip similarity index 100% rename from tests/data/downloader/two-files.zip rename to tests/data/downloader/foo-bar.zip diff --git a/tests/data/downloader/one-file.zip.license b/tests/data/downloader/foo-bar.zip.license similarity index 100% rename from tests/data/downloader/one-file.zip.license rename to tests/data/downloader/foo-bar.zip.license diff --git a/tests/data/downloader/foo.zip b/tests/data/downloader/foo.zip new file mode 100644 index 0000000000000000000000000000000000000000..8e13ab608434d4387c6fe15626003c35f659f68f GIT binary patch literal 134 zcmWIWW@Zs#00Eca#ISwL8ToubHVAVAaaw-9J`n4bRFwGS7p3MZWEK>c=IZI`1$Z+u ji7?>S2-M5K$e;kCfB?yu0B=?{kPssfS^{Zv5QhN(qP!L{ literal 0 HcmV?d00001 diff --git a/tests/data/downloader/two-files.zip.license b/tests/data/downloader/foo.zip.license similarity index 100% rename from tests/data/downloader/two-files.zip.license rename to tests/data/downloader/foo.zip.license diff --git a/tests/data/downloader/one-file.zip b/tests/data/downloader/one-file.zip deleted file mode 100644 index 1c1130a3a8542f079d908eb6f6ece0c30122fe52..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 126 zcmWIWW@Zs#00Eca#ISwL8ToubHVCr=aaw-9UP(oXPkvEqu0m!(acQoeo?d`ABa;XN gZcRXa42%p4APNYeMg(}XvVpjaKxhi2jX@j+0Fbs5I{*Lx diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 99409e5d..2f2d993d 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -20,13 +20,13 @@ get_response_info, ) -from ...utils import assert_log_exists +from ...utils import MockTqdm, assert_log_exists Response = namedtuple("Response", ["status", "headers"]) DATA_DIR = Path(__file__).parents[2] / "data" / "downloader" -ZIP1 = DATA_DIR / "one-file.zip" -ZIP2 = DATA_DIR / "two-files.zip" +ZIP1 = DATA_DIR / "foo.zip" +ZIP2 = DATA_DIR / "foo-bar.zip" ZIP1_SIZE = ZIP1.stat().st_size ZIP2_SIZE = ZIP2.stat().st_size @@ -37,10 +37,6 @@ def temp_dir(): yield Path(tmp) -def make_dir(dirname: Path): - dirname.mkdir(parents=True) - - def test_progress_hook(): # Arrange progress_bar = MagicMock() @@ -111,7 +107,7 @@ def test_download_and_extract__overwrite(mock_download: MagicMock, temp_dir: Pat mock_download.return_value = src_file_path dst_dir_path = temp_dir / "data" - make_dir(dst_dir_path) + dst_dir_path.mkdir() # Act / Assert download_and_extract(url=MagicMock(), overwrite=False) @@ -150,7 +146,7 @@ def urlretrieve(*_args, **_kwargs): mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve - # Act / assert + # Act / Assert with structlog.testing.capture_logs() as capture: result = download(url=url, file_name=file_path, dir_path=dir_path) assert_log_exists(capture, "debug", "Downloading file") @@ -398,11 +394,61 @@ def test_get_download_path__missing_data(temp_dir: Path): def test_get_download_path__invalid_file_path(temp_dir: Path): # Arrange - make_dir(temp_dir / "download") + (temp_dir / "download").mkdir() + # Act / Assert with pytest.raises(ValueError, match=r"Invalid file path:"): get_download_path(dir_path=temp_dir, file_name="download") -def test_extract(temp_dir: Path): - extract +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + # Assert + assert (dst_dir_path / "foo.txt").is_file() + assert (dst_dir_path / "folder/bar.txt").is_file() + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with pytest.raises(FileExistsError, match=r"Destination file .*foo\.txt exists and is not empty"): + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with structlog.testing.capture_logs() as capture: + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert_log_exists( + capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" + ) diff --git a/tests/utils.py b/tests/utils.py index 045686c7..e6cd4547 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -268,3 +268,16 @@ def sheet_names(self) -> List[str]: def parse(self, sheet_name: str, **_kwargs) -> pd.DataFrame: return self.data[sheet_name] + + + + +class MockTqdm: + """To use: for x in tqdm(iterable)""" + + def __init__(self, iterable=None, **kwargs): + self.iterable = iterable + + def __iter__(self): + for item in self.iterable: + yield item \ No newline at end of file From 6ae114bcf9e2daf6a7f69f3c2d256dd9547f3138 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 13:52:03 +0100 Subject: [PATCH 43/50] Unit test for extract() Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 24 ++++++++++----- tests/unit/utils/test_download.py | 36 +++++++++++++++++++++-- 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 40c6db58..e8d40ead 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -224,13 +224,13 @@ def get_download_path( # Combine the two paths assert file_name is not None - file_path = (dir_path / file_name).resolve() if dir_path else Path(file_name).resolve() + file_path = (dir_path / file_name) if dir_path else Path(file_name) # If the file_path exists, it should be a file (not a dir) if file_path.exists() and not file_path.is_file(): raise ValueError(f"Invalid file path: {file_path}") - return file_path + return file_path.resolve() def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: @@ -272,9 +272,19 @@ def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_ex raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") zip_file.extract(member=file_path, path=dst_dir_path) - # If the zip files contains a single directory with the same name as the zip file, return that dir - contents = list(dst_dir_path.iterdir()) - if len(contents) == 1 and contents[0].is_dir() and contents[0].name == src_file_path.stem: - dst_dir_path = contents[0] + # Zip files often contain a single directory with the same name as the zip file. + # In that case, return the dir to that directory instead of the root dir + only_item: Optional[Path] = None + for item in dst_dir_path.iterdir(): + # If only_item is None, this is the first iteration, so item may be the only item + if only_item is None: + only_item = item + # Else, if only_item is not None, there are more than one items in the root of the directory. + # This means hat there is no 'only_item' and we can stop the loop + else: + only_item = None + break + if only_item and only_item.is_dir() and only_item.name == src_file_path.stem: + dst_dir_path = only_item - return dst_dir_path + return dst_dir_path.resolve() diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 2f2d993d..7b70353f 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -410,13 +410,30 @@ def test_extract(mock_tqdm: MagicMock, temp_dir: Path): mock_tqdm.side_effect = MockTqdm # Act - extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + extract_dir_path = extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) # Assert + assert extract_dir_path == dst_dir_path assert (dst_dir_path / "foo.txt").is_file() assert (dst_dir_path / "folder/bar.txt").is_file() +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__auto_dir(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "compressed" + assert (temp_dir / "compressed" / "foo.txt").is_file() + assert (temp_dir / "compressed" / "folder" / "bar.txt").is_file() + + @patch("power_grid_model_io.utils.download.tqdm") def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): # Arrange @@ -438,7 +455,7 @@ def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): # Arrange src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "extracted" + dst_dir_path = temp_dir / "compressed" shutil.copyfile(ZIP2, src_file_path) mock_tqdm.side_effect = MockTqdm @@ -452,3 +469,18 @@ def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): assert_log_exists( capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" ) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__return_subdir_path(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "foo.zip" + shutil.copyfile(ZIP1, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "foo" / "foo" + assert (temp_dir / "foo" / "foo" / "foo.txt").is_file() From b7d2cea068e19b1bd71f8edf16d4e89261112c4b Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 14:49:47 +0100 Subject: [PATCH 44/50] Split download en zip_files util Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 76 +++---------- tests/data/downloader/foo-bar.zip | Bin 247 -> 0 bytes tests/data/downloader/foo-bar.zip.license | 3 - tests/data/downloader/foo.zip | Bin 134 -> 0 bytes tests/data/downloader/foo.zip.license | 3 - tests/unit/utils/test_download.py | 128 ++++------------------ 6 files changed, 33 insertions(+), 177 deletions(-) delete mode 100644 tests/data/downloader/foo-bar.zip delete mode 100644 tests/data/downloader/foo-bar.zip.license delete mode 100644 tests/data/downloader/foo.zip delete mode 100644 tests/data/downloader/foo.zip.license diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index e8d40ead..28912903 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -6,13 +6,19 @@ The most simple (and intended) usage is: url = "http://141.51.193.167/simbench/gui/usecase/download/?simbench_code=1-complete_data-mixed-all-0-sw&format=csv" -csv_dir = download_and_extract(url) +zip_file_path = download(url) It will download the zip file 1-complete_data-mixed-all-0-sw.zip to a folder in you systems temp dir; for example -"/tmp/1-complete_data-mixed-all-0-sw.zip". Then it extracts the files there as well, in a folder that corresponds to -the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in our example), and it returns the path to that directory. -By default, it will not re-download or re-extract the zip file as long as the files exist in you temp dir. Your temp -dir is typically emptied whe you reboot your computer. +"/tmp/1-complete_data-mixed-all-0-sw.zip". + +Another convenience function is download_and_extract(): + +csv_dir_path = download_and_extract(url) + +This downloads the zip file as described above, and then it extracts the files there as well, in a folder which +corresponds to the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in our example), and it returns the path to +that directory. By default, it will not re-download or re-extract the zip file as long as the files exist in your +temp dir. Your temp dir is typically emptied whe you reboot your computer. """ @@ -20,7 +26,6 @@ import hashlib import re import tempfile -import zipfile from dataclasses import dataclass from pathlib import Path from shutil import rmtree as remove_dir @@ -30,6 +35,8 @@ import structlog from tqdm import tqdm +from power_grid_model_io.utils.zip_files import extract + _log = structlog.get_logger(__name__) @@ -231,60 +238,3 @@ def get_download_path( raise ValueError(f"Invalid file path: {file_path}") return file_path.resolve() - - -def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: - """ - Extract a .zip file and return the destination dir - - Args: - src_file_path: The .zip file to extract. - dst_dir_path: An optional destination path. If none is given, the src_file_path without .zip extension is used. - skip_if_exists: Skip existing files, otherwise raise an exception when a file exists. - - Returns: The path where the files are extracted - - """ - if src_file_path.suffix.lower() != ".zip": - raise ValueError(f"Only files with .zip extension are supported, got {src_file_path}") - - if dst_dir_path is None: - dst_dir_path = src_file_path.with_suffix("") - - log = _log.bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) - - if dst_dir_path.exists(): - if not dst_dir_path.is_dir(): - raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") - - # Create the destination directory - dst_dir_path.mkdir(parents=True, exist_ok=True) - - # Extract per file, so we can show a progress bar - with zipfile.ZipFile(src_file_path, "r") as zip_file: - file_list = zip_file.namelist() - for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): - dst_file_path = dst_dir_path / file_path - if dst_file_path.exists() and dst_file_path.stat().st_size > 0: - if skip_if_exists: - log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) - continue - raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") - zip_file.extract(member=file_path, path=dst_dir_path) - - # Zip files often contain a single directory with the same name as the zip file. - # In that case, return the dir to that directory instead of the root dir - only_item: Optional[Path] = None - for item in dst_dir_path.iterdir(): - # If only_item is None, this is the first iteration, so item may be the only item - if only_item is None: - only_item = item - # Else, if only_item is not None, there are more than one items in the root of the directory. - # This means hat there is no 'only_item' and we can stop the loop - else: - only_item = None - break - if only_item and only_item.is_dir() and only_item.name == src_file_path.stem: - dst_dir_path = only_item - - return dst_dir_path.resolve() diff --git a/tests/data/downloader/foo-bar.zip b/tests/data/downloader/foo-bar.zip deleted file mode 100644 index 5f7c0598608bcbeb5f92ef6fa083c84039038b06..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 247 zcmWIWW@Zs#00FPy#IOdj7;`}&8-)3QI4wUXCACODDX~beq@qMmPcJ1uC%;IcII~0{ zF*mg&0It~usCgfAMm|Ur2(yDV=Yw?lU~k@aG;5Tpa5*A}c7Y<++?D;r3H2?z^;bODIN0079*E2jVe diff --git a/tests/data/downloader/foo-bar.zip.license b/tests/data/downloader/foo-bar.zip.license deleted file mode 100644 index e6220e93..00000000 --- a/tests/data/downloader/foo-bar.zip.license +++ /dev/null @@ -1,3 +0,0 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project - -SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/downloader/foo.zip b/tests/data/downloader/foo.zip deleted file mode 100644 index 8e13ab608434d4387c6fe15626003c35f659f68f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 134 zcmWIWW@Zs#00Eca#ISwL8ToubHVAVAaaw-9J`n4bRFwGS7p3MZWEK>c=IZI`1$Z+u ji7?>S2-M5K$e;kCfB?yu0B=?{kPssfS^{Zv5QhN(qP!L{ diff --git a/tests/data/downloader/foo.zip.license b/tests/data/downloader/foo.zip.license deleted file mode 100644 index e6220e93..00000000 --- a/tests/data/downloader/foo.zip.license +++ /dev/null @@ -1,3 +0,0 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project - -SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index 7b70353f..c4593aa7 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project # # SPDX-License-Identifier: MPL-2.0 -import shutil import tempfile from collections import namedtuple from pathlib import Path @@ -15,21 +14,14 @@ ResponseInfo, download, download_and_extract, - extract, get_download_path, get_response_info, ) -from ...utils import MockTqdm, assert_log_exists +from ...utils import assert_log_exists Response = namedtuple("Response", ["status", "headers"]) -DATA_DIR = Path(__file__).parents[2] / "data" / "downloader" -ZIP1 = DATA_DIR / "foo.zip" -ZIP2 = DATA_DIR / "foo-bar.zip" -ZIP1_SIZE = ZIP1.stat().st_size -ZIP2_SIZE = ZIP2.stat().st_size - @pytest.fixture() def temp_dir(): @@ -37,6 +29,11 @@ def temp_dir(): yield Path(tmp) +def make_file(file_path: Path, file_size: int = 0): + with open(file_path, "wb") as fp: + fp.write(b"\0" * file_size) + + def test_progress_hook(): # Arrange progress_bar = MagicMock() @@ -139,10 +136,10 @@ def test_download( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP1, temp_file) + make_file(temp_file, 100) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=100, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -160,7 +157,7 @@ def urlretrieve(*_args, **_kwargs): mock_urlretrieve.assert_called_once_with(url, reporthook=mock_hook.return_value) assert result == file_path assert result.is_file() - assert result.stat().st_size == ZIP1_SIZE + assert result.stat().st_size == 100 @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -176,7 +173,7 @@ def test_download__auto_file_name( download_path = temp_dir / "data.zip" def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP1, temp_file) + make_file(temp_file, 100) return temp_file, None mock_info.return_value = ResponseInfo(status=200, file_size=None, file_name="remote.zip") @@ -228,9 +225,9 @@ def test_download__skip_existing_file( ): # Arrange download_path = temp_dir / "data.zip" - shutil.copyfile(ZIP1, download_path) + make_file(download_path, 100) - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=100, file_name="remote.zip") mock_download_path.return_value = download_path # Act @@ -253,13 +250,13 @@ def test_download__update_file( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - shutil.copyfile(ZIP1, download_path) + make_file(download_path, 100) def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP2, temp_file) + make_file(temp_file, 101) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP2_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=101, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -271,7 +268,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP2_SIZE + assert result.stat().st_size == 101 @patch("power_grid_model_io.utils.download.DownloadProgressHook", new=MagicMock()) @@ -285,13 +282,13 @@ def test_download__overwrite( # Arrange temp_file = temp_dir / "data.download" download_path = temp_dir / "data.zip" - shutil.copyfile(ZIP1, download_path) + make_file(download_path, 100) def urlretrieve(*_args, **_kwargs): - shutil.copyfile(ZIP1, temp_file) + make_file(temp_file, 100) return temp_file, None - mock_info.return_value = ResponseInfo(status=200, file_size=ZIP1_SIZE, file_name="remote.zip") + mock_info.return_value = ResponseInfo(status=200, file_size=100, file_name="remote.zip") mock_download_path.return_value = download_path mock_urlretrieve.side_effect = urlretrieve @@ -303,7 +300,7 @@ def urlretrieve(*_args, **_kwargs): # Assert assert result == download_path assert result.is_file() - assert result.stat().st_size == ZIP1_SIZE + assert result.stat().st_size == 100 @patch("power_grid_model_io.utils.download.get_response_info") @@ -399,88 +396,3 @@ def test_get_download_path__invalid_file_path(temp_dir: Path): # Act / Assert with pytest.raises(ValueError, match=r"Invalid file path:"): get_download_path(dir_path=temp_dir, file_name="download") - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "extracted" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - # Act - extract_dir_path = extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) - - # Assert - assert extract_dir_path == dst_dir_path - assert (dst_dir_path / "foo.txt").is_file() - assert (dst_dir_path / "folder/bar.txt").is_file() - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__auto_dir(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - # Act - extract_dir_path = extract(src_file_path=src_file_path) - - # Assert - assert extract_dir_path == temp_dir / "compressed" - assert (temp_dir / "compressed" / "foo.txt").is_file() - assert (temp_dir / "compressed" / "folder" / "bar.txt").is_file() - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "extracted" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - dst_dir_path.mkdir() - with open(dst_dir_path / "foo.txt", "wb") as fp: - fp.write(b"\0") - - # Act / Assert - with pytest.raises(FileExistsError, match=r"Destination file .*foo\.txt exists and is not empty"): - extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "compressed.zip" - dst_dir_path = temp_dir / "compressed" - shutil.copyfile(ZIP2, src_file_path) - mock_tqdm.side_effect = MockTqdm - - dst_dir_path.mkdir() - with open(dst_dir_path / "foo.txt", "wb") as fp: - fp.write(b"\0") - - # Act / Assert - with structlog.testing.capture_logs() as capture: - extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) - assert_log_exists( - capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" - ) - - -@patch("power_grid_model_io.utils.download.tqdm") -def test_extract__return_subdir_path(mock_tqdm: MagicMock, temp_dir: Path): - # Arrange - src_file_path = temp_dir / "foo.zip" - shutil.copyfile(ZIP1, src_file_path) - mock_tqdm.side_effect = MockTqdm - - # Act - extract_dir_path = extract(src_file_path=src_file_path) - - # Assert - assert extract_dir_path == temp_dir / "foo" / "foo" - assert (temp_dir / "foo" / "foo" / "foo.txt").is_file() From 6bbf4b7cc5604c97b8a2a4428a1141b030e3c967 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:07:16 +0100 Subject: [PATCH 45/50] Split download en zip_files util Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/zip_files.py | 79 ++++++++++++++ tests/unit/utils/test_zip_files.py | 115 +++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 src/power_grid_model_io/utils/zip_files.py create mode 100644 tests/unit/utils/test_zip_files.py diff --git a/src/power_grid_model_io/utils/zip_files.py b/src/power_grid_model_io/utils/zip_files.py new file mode 100644 index 00000000..6d5c45a5 --- /dev/null +++ b/src/power_grid_model_io/utils/zip_files.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +""" +Helper function to extract zip files + +csv_dir_path = extract("/tmp/1-complete_data-mixed-all-0-sw.zip") + +This extracts the files, in a folder which corresponds to the zip file name ("/tmp/1-complete_data-mixed-all-0-sw/" in +our example), and it returns the path to that directory. By default, it will not re-download or re-extract the zip +file as long as the files exist. + +""" + +import zipfile +from pathlib import Path +from typing import Optional + +import structlog +from tqdm import tqdm + +_log = structlog.get_logger(__name__) + + +def extract(src_file_path: Path, dst_dir_path: Optional[Path] = None, skip_if_exists=False) -> Path: + """ + Extract a .zip file and return the destination dir + + Args: + src_file_path: The .zip file to extract. + dst_dir_path: An optional destination path. If none is given, the src_file_path without .zip extension is used. + skip_if_exists: Skip existing files, otherwise raise an exception when a file exists. + + Returns: The path where the files are extracted + + """ + if src_file_path.suffix.lower() != ".zip": + raise ValueError(f"Only files with .zip extension are supported, got {src_file_path.name}") + + if dst_dir_path is None: + dst_dir_path = src_file_path.with_suffix("") + + log = _log.bind(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + if dst_dir_path.exists(): + if not dst_dir_path.is_dir(): + raise NotADirectoryError(f"Destination dir {dst_dir_path} exists and is not a directory") + + # Create the destination directory + dst_dir_path.mkdir(parents=True, exist_ok=True) + + # Extract per file, so we can show a progress bar + with zipfile.ZipFile(src_file_path, "r") as zip_file: + file_list = zip_file.namelist() + for file_path in tqdm(desc="Extracting", iterable=file_list, total=len(file_list), unit="file", leave=True): + dst_file_path = dst_dir_path / file_path + if dst_file_path.exists() and dst_file_path.stat().st_size > 0: + if skip_if_exists: + log.debug("Skip file extraction, destination file exists", dst_file_path=dst_file_path) + continue + raise FileExistsError(f"Destination file {dst_dir_path / file_path} exists and is not empty") + zip_file.extract(member=file_path, path=dst_dir_path) + + # Zip files often contain a single directory with the same name as the zip file. + # In that case, return the dir to that directory instead of the root dir + only_item: Optional[Path] = None + for item in dst_dir_path.iterdir(): + # If only_item is None, this is the first iteration, so item may be the only item + if only_item is None: + only_item = item + # Else, if only_item is not None, there are more than one items in the root of the directory. + # This means hat there is no 'only_item' and we can stop the loop + else: + only_item = None + break + if only_item and only_item.is_dir() and only_item.name == src_file_path.stem: + dst_dir_path = only_item + + return dst_dir_path.resolve() diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py new file mode 100644 index 00000000..e6b79c6b --- /dev/null +++ b/tests/unit/utils/test_zip_files.py @@ -0,0 +1,115 @@ +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# +# SPDX-License-Identifier: MPL-2.0 +import shutil +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +import structlog.testing + +from power_grid_model_io.utils.zip_files import extract + +from ...utils import MockTqdm, assert_log_exists + +DATA_DIR = Path(__file__).parents[2] / "data" / "zip_files" +ZIP1 = DATA_DIR / "foo.zip" +ZIP2 = DATA_DIR / "foo-bar.zip" + + +@pytest.fixture() +def temp_dir(): + with tempfile.TemporaryDirectory() as tmp: + yield Path(tmp) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + # Assert + assert extract_dir_path == dst_dir_path + assert (dst_dir_path / "foo.txt").is_file() + assert (dst_dir_path / "folder/bar.txt").is_file() + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__auto_dir(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "compressed" + assert (temp_dir / "compressed" / "foo.txt").is_file() + assert (temp_dir / "compressed" / "folder" / "bar.txt").is_file() + + +def test_extract__invalid_file_extension(): + # Act / Assert + with pytest.raises(ValueError, match=r"Only files with \.zip extension are supported, got tempfile\.download"): + extract(src_file_path=Path("/tmp/dir/tempfile.download")) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "extracted" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with pytest.raises(FileExistsError, match=r"Destination file .*foo\.txt exists and is not empty"): + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__skip_if_exists(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "compressed.zip" + dst_dir_path = temp_dir / "compressed" + shutil.copyfile(ZIP2, src_file_path) + mock_tqdm.side_effect = MockTqdm + + dst_dir_path.mkdir() + with open(dst_dir_path / "foo.txt", "wb") as fp: + fp.write(b"\0") + + # Act / Assert + with structlog.testing.capture_logs() as capture: + extract(src_file_path=src_file_path, dst_dir_path=dst_dir_path, skip_if_exists=True) + assert_log_exists( + capture, "debug", "Skip file extraction, destination file exists", dst_file_path=dst_dir_path / "foo.txt" + ) + + +@patch("power_grid_model_io.utils.download.tqdm") +def test_extract__return_subdir_path(mock_tqdm: MagicMock, temp_dir: Path): + # Arrange + src_file_path = temp_dir / "foo.zip" + shutil.copyfile(ZIP1, src_file_path) + mock_tqdm.side_effect = MockTqdm + + # Act + extract_dir_path = extract(src_file_path=src_file_path) + + # Assert + assert extract_dir_path == temp_dir / "foo" / "foo" + assert (temp_dir / "foo" / "foo" / "foo.txt").is_file() From 48a98e023b2ac1bcf370379bdf4cbecfadf69659 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:14:02 +0100 Subject: [PATCH 46/50] Final unit test for extract() Signed-off-by: Bram Stoeller --- tests/unit/utils/test_zip_files.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py index e6b79c6b..0a651624 100644 --- a/tests/unit/utils/test_zip_files.py +++ b/tests/unit/utils/test_zip_files.py @@ -63,6 +63,16 @@ def test_extract__invalid_file_extension(): extract(src_file_path=Path("/tmp/dir/tempfile.download")) +def test_extract__invalid_dst_dir(temp_dir: Path): + # Arrange + with open(temp_dir / "notadir.txt", "wb"): + pass + + # Act / Assert + with pytest.raises(NotADirectoryError, match=r"Destination dir .*notadir\.txt exists and is not a directory"): + extract(src_file_path=Path("file.zip"), dst_dir_path=temp_dir / "notadir.txt") + + @patch("power_grid_model_io.utils.download.tqdm") def test_extract__file_exists(mock_tqdm: MagicMock, temp_dir: Path): # Arrange From 4de558239e522ba07cfcd7f273bb97b6fec3e0fb Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:23:39 +0100 Subject: [PATCH 47/50] Add test .zip files Signed-off-by: Bram Stoeller --- tests/data/zip_files/foo-bar.zip | Bin 0 -> 247 bytes tests/data/zip_files/foo-bar.zip.license | 3 +++ tests/data/zip_files/foo.zip | Bin 0 -> 134 bytes tests/data/zip_files/foo.zip.license | 3 +++ 4 files changed, 6 insertions(+) create mode 100644 tests/data/zip_files/foo-bar.zip create mode 100644 tests/data/zip_files/foo-bar.zip.license create mode 100644 tests/data/zip_files/foo.zip create mode 100644 tests/data/zip_files/foo.zip.license diff --git a/tests/data/zip_files/foo-bar.zip b/tests/data/zip_files/foo-bar.zip new file mode 100644 index 0000000000000000000000000000000000000000..5f7c0598608bcbeb5f92ef6fa083c84039038b06 GIT binary patch literal 247 zcmWIWW@Zs#00FPy#IOdj7;`}&8-)3QI4wUXCACODDX~beq@qMmPcJ1uC%;IcII~0{ zF*mg&0It~usCgfAMm|Ur2(yDV=Yw?lU~k@aG;5Tpa5*A}c7Y<++?D;r3H2?z^;bODIN0079*E2jVe literal 0 HcmV?d00001 diff --git a/tests/data/zip_files/foo-bar.zip.license b/tests/data/zip_files/foo-bar.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/zip_files/foo-bar.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/zip_files/foo.zip b/tests/data/zip_files/foo.zip new file mode 100644 index 0000000000000000000000000000000000000000..8e13ab608434d4387c6fe15626003c35f659f68f GIT binary patch literal 134 zcmWIWW@Zs#00Eca#ISwL8ToubHVAVAaaw-9J`n4bRFwGS7p3MZWEK>c=IZI`1$Z+u ji7?>S2-M5K$e;kCfB?yu0B=?{kPssfS^{Zv5QhN(qP!L{ literal 0 HcmV?d00001 diff --git a/tests/data/zip_files/foo.zip.license b/tests/data/zip_files/foo.zip.license new file mode 100644 index 00000000..e6220e93 --- /dev/null +++ b/tests/data/zip_files/foo.zip.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project + +SPDX-License-Identifier: MPL-2.0 \ No newline at end of file From 44617a360024f33ac44e16ad540a3a51a49b4877 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 15:28:09 +0100 Subject: [PATCH 48/50] The project is called Power Grid Model Signed-off-by: Bram Stoeller --- src/power_grid_model_io/utils/download.py | 2 +- src/power_grid_model_io/utils/zip_files.py | 2 +- tests/data/zip_files/foo-bar.zip.license | 2 +- tests/data/zip_files/foo.zip.license | 2 +- tests/unit/utils/test_download.py | 2 +- tests/unit/utils/test_zip_files.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/power_grid_model_io/utils/download.py b/src/power_grid_model_io/utils/download.py index 28912903..6e5db493 100644 --- a/src/power_grid_model_io/utils/download.py +++ b/src/power_grid_model_io/utils/download.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 """ diff --git a/src/power_grid_model_io/utils/zip_files.py b/src/power_grid_model_io/utils/zip_files.py index 6d5c45a5..cf001a83 100644 --- a/src/power_grid_model_io/utils/zip_files.py +++ b/src/power_grid_model_io/utils/zip_files.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 """ diff --git a/tests/data/zip_files/foo-bar.zip.license b/tests/data/zip_files/foo-bar.zip.license index e6220e93..ebe16ce8 100644 --- a/tests/data/zip_files/foo-bar.zip.license +++ b/tests/data/zip_files/foo-bar.zip.license @@ -1,3 +1,3 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/data/zip_files/foo.zip.license b/tests/data/zip_files/foo.zip.license index e6220e93..ebe16ce8 100644 --- a/tests/data/zip_files/foo.zip.license +++ b/tests/data/zip_files/foo.zip.license @@ -1,3 +1,3 @@ -SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project SPDX-License-Identifier: MPL-2.0 \ No newline at end of file diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index c4593aa7..ecca974a 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 import tempfile diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py index 0a651624..5f745e3a 100644 --- a/tests/unit/utils/test_zip_files.py +++ b/tests/unit/utils/test_zip_files.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model IO project +# SPDX-FileCopyrightText: 2022 Contributors to the Power Grid Model project # # SPDX-License-Identifier: MPL-2.0 import shutil From b4c37c0d53d5e22f0836d29704b12b30eaefa735 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 16:10:33 +0100 Subject: [PATCH 49/50] Resolve temp dir in unit tests Signed-off-by: Bram Stoeller --- tests/unit/utils/test_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index ecca974a..a52e3f3e 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -26,7 +26,7 @@ @pytest.fixture() def temp_dir(): with tempfile.TemporaryDirectory() as tmp: - yield Path(tmp) + yield Path(tmp).resolve() def make_file(file_path: Path, file_size: int = 0): From 5f8f32a6ec04d5ee57128fbc6f06568fba4a0f36 Mon Sep 17 00:00:00 2001 From: Bram Stoeller Date: Wed, 1 Mar 2023 16:18:20 +0100 Subject: [PATCH 50/50] Resolve temp dir in unit tests Signed-off-by: Bram Stoeller --- tests/unit/utils/test_download.py | 2 +- tests/unit/utils/test_zip_files.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/utils/test_download.py b/tests/unit/utils/test_download.py index a52e3f3e..7e886703 100644 --- a/tests/unit/utils/test_download.py +++ b/tests/unit/utils/test_download.py @@ -367,7 +367,7 @@ def test_get_download_path__auto_dir(): path = get_download_path(file_name="file_name.zip") # Assert - assert path == Path(tempfile.gettempdir()).absolute() / "file_name.zip" + assert path == Path(tempfile.gettempdir()).resolve() / "file_name.zip" def test_get_download_path__auto_file_name(temp_dir: Path): diff --git a/tests/unit/utils/test_zip_files.py b/tests/unit/utils/test_zip_files.py index 5f745e3a..7c9d7e35 100644 --- a/tests/unit/utils/test_zip_files.py +++ b/tests/unit/utils/test_zip_files.py @@ -21,7 +21,7 @@ @pytest.fixture() def temp_dir(): with tempfile.TemporaryDirectory() as tmp: - yield Path(tmp) + yield Path(tmp).resolve() @patch("power_grid_model_io.utils.download.tqdm")