From b196f5ad56f9daaa32de7af4209a562941523a2a Mon Sep 17 00:00:00 2001 From: "Rui Ying (Mac)" <38958822+ruiying-ocean@users.noreply.github.com> Date: Thu, 19 Sep 2024 13:57:51 +0100 Subject: [PATCH] v0.13.5 add extra dependency pangaeapy --- pyproject.toml | 5 ++- setup.cfg | 8 +++- src/cgeniepy/table.py | 101 ++++++++++++++++++++++++++---------------- 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6f0e600..1e33a9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,9 @@ build-backend = "setuptools.build_meta" [project] name = "cgeniepy" -version = "0.13.3" +version = "0.13.5" description = "A Python package to read, analyse and visualise cGENIE Earth System Model output" license = {text = "GPL-3.0-only"} + +[project.optional-dependencies] +extra = ["pangaeapy"] diff --git a/setup.cfg b/setup.cfg index 9bfb4ee..fe66f8a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,8 @@ [metadata] name = cgeniepy -version = 0.13.3 +version = 0.13.5 author = Rui Ying -author_email = rui.ying@bristol.ac.uk +author_email = ying.rui@outlook.com description = A Python package to read, analyse and visualise cGENIE Earth System Model output long_description = file: README.md long_description_content_type = text/markdown @@ -33,3 +33,7 @@ where = src [options.package_data] * = *.nc, *.csv, *.txt, *.xml + +[options.extras_require] +extra = pangaeapy + diff --git a/src/cgeniepy/table.py b/src/cgeniepy/table.py index 68ca883..66e9e70 100644 --- a/src/cgeniepy/table.py +++ b/src/cgeniepy/table.py @@ -5,6 +5,8 @@ import pandas as pd from io import StringIO +import re +from typing import Union from cgeniepy.grid import Interpolator, GridOperation @@ -12,41 +14,64 @@ from cgeniepy.grid import GridOperation import cgeniepy.array as ca from importlib.resources import files - + class ScatterData: - """ScatterData is a class to store non-gridded data with columns of coordinates. - """ + """ScatterData is a class to store non-gridded data with columns of coordinates.""" - def __init__(self, data, mutable=False, *args, **kwargs): + def __init__(self, data: Union[pd.DataFrame, int, str], mutable: bool = False, **kwargs): """ Initialize a ScatterData object. Parameters: - data: The path to the file or the data. - coord_cols (dict): A dictionary specifying the coordinate columns. + data: The path to the file, the data, or a PanDataSet ID. + mutable: Whether the data is mutable. + **kwargs: Additional keyword arguments for pandas read functions. """ - ## if already a dataframe - if isinstance(data, pd.DataFrame): - self.data = data - self.mutable = mutable + self.data = self._process_data(data, **kwargs) - ## if a file path then read the file into a dataframe - if isinstance(data, str): - if data.endswith(".tab"): - data = self._parse_tab_file(data) - self.data= pd.read_csv(StringIO(data), *args, **kwargs) - elif data.endswith("xlsx"): - self.data = pd.read_excel(data, *args, **kwargs) - else: - self.data = pd.read_csv(data, *args, **kwargs) - - ## if the index is already set in the data, then set the coordinates + # if the index is already set in the data, then set the coordinates if not isinstance(self.data.index, pd.core.indexes.range.RangeIndex): self.index= list(self.data.index.names) GridOperation().set_coordinates(obj=self, index=self.index) + def _process_data(self, data: Union[pd.DataFrame, int, str], **kwargs) -> pd.DataFrame: + if isinstance(data, pd.DataFrame): + return data + if isinstance(data, int): + try: + from pangaeapy.pandataset import PanDataSet + return PanDataSet(data).data + except ImportError: + print("Unable to import PanDataSet from pangaeapy. Please make sure the package is installed.") + if isinstance(data, str): + return self._process_string_data(data, **kwargs) + raise ValueError("Unsupported data type. Expected DataFrame, int, or str.") + + def _process_string_data(self, data: str, **kwargs) -> pd.DataFrame: + if data.endswith(".tab"): + return pd.read_csv(StringIO(self._parse_tab_file(data)), **kwargs) + if data.endswith(".xlsx"): + return pd.read_excel(data, **kwargs) + if "PANGAEA" in data: + try: + from pangaeapy.pandataset import PanDataSet + except ImportError: + print("Unable to import PanDataSet from pangaeapy. Please make sure the package is installed.") + + doi = self._extract_doi(data) + return PanDataSet(doi).data + return pd.read_csv(data, **kwargs) + + @staticmethod + def _extract_doi(url: str) -> str: + doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+" + doi_match = re.search(doi_pattern, url, re.IGNORECASE) + if doi_match: + return doi_match.group(0) + raise ValueError("No valid DOI found in the URL.") + def __repr__(self): prefix = "ScatterData\n" columns = f"Columns: {self.data.columns}\n" @@ -58,22 +83,6 @@ def __repr__(self): return prefix + columns + index + rows - - def set_index(self, index): - """Tell the object which columns are the coordinates. - """ - self.data.set_index(index, inplace=True) - self.index= index - GridOperation().set_coordinates(obj=self, index=self.index) - - - def reset_index(self): - if self.mutable: - self.data = self.data.reset_index() - return self - else: - return self.data.reset_index() - def _parse_tab_file(self, filename, begin_cmt = '/*', end_cmt = '*/'): """ Read a tab-delimited file and return a pandas that is optimised for pangea-format data. @@ -101,7 +110,7 @@ def _parse_tab_file(self, filename, begin_cmt = '/*', end_cmt = '*/'): lines.append(line.rstrip('\n')) data = '\n'.join(lines) - return data + return data def __getitem__(self, item): return self.data[item] @@ -118,6 +127,22 @@ def _check_cols(self, cols): if col not in self.data.columns: raise ValueError(f"{col} not found in the dataframe") + + def set_index(self, index): + """Tell the object which columns are the coordinates. + """ + self.data.set_index(index, inplace=True) + self.index= index + GridOperation().set_coordinates(obj=self, index=self.index) + + + def reset_index(self): + if self.mutable: + self.data = self.data.reset_index() + return self + else: + return self.data.reset_index() + def detect_basin(self): """use point-in-polygon strategy to detect modern ocean basin according to lon/lat column