From 175ba3c7cc5c5126f015ef7e7a1ba95b6ad8dacc Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 21 Mar 2024 13:29:36 +0800 Subject: [PATCH 1/8] GMT_DATASET: Return an empty DataFrame if the file has no data --- pygmt/datatypes/dataset.py | 10 ++-- pygmt/tests/test_datatypes_dataset.py | 83 +++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 pygmt/tests/test_datatypes_dataset.py diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index a0d0547f3ca..4828afb6064 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure): # noqa: N801 """ GMT dataset structure for holding multiple tables (files). - This class is only meant for internal use by PyGMT and is not exposed to users. - See the GMT source code gmt_resources.h for the original C struct definitions. + This class is only meant for internal use and is not exposed to users. See the GMT + source code ``gmt_resources.h`` for the original C struct definitions. Examples -------- @@ -151,6 +151,8 @@ def to_dataframe(self) -> pd.DataFrame: the same. The same column in all segments of all tables are concatenated. The trailing text column is also concatenated as a single string column. + If the object has no data, an empty DataFrame will be returned. 
+ Returns ------- df @@ -185,8 +187,8 @@ def to_dataframe(self) -> pd.DataFrame: >>> df.dtypes.to_list() [dtype('float64'), dtype('float64'), dtype('float64'), string[python]] """ - # Deal with numeric columns vectors = [] + # Deal with numeric columns for icol in range(self.n_columns): colvector = [] for itbl in range(self.n_tables): @@ -211,5 +213,5 @@ def to_dataframe(self) -> pd.DataFrame: pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype()) ) - df = pd.concat(objs=vectors, axis=1) + df = pd.concat(objs=vectors, axis=1) if vectors else pd.DataFrame() return df diff --git a/pygmt/tests/test_datatypes_dataset.py b/pygmt/tests/test_datatypes_dataset.py new file mode 100644 index 00000000000..c349ba413dc --- /dev/null +++ b/pygmt/tests/test_datatypes_dataset.py @@ -0,0 +1,83 @@ +""" +Tests for GMT_DATASET data type. +""" + +from pathlib import Path + +import pandas as pd +import pytest +from pygmt.clib import Session +from pygmt.helpers import GMTTempFile + + +def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None): + """ + Read a tabular data as pandas.DataFrame object using pandas.read_csv(). + + The parameters have the same meaning as in ``pandas.read_csv()``. + """ + try: + df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header) + except pd.errors.EmptyDataError: + # Return an empty DataFrame if the file has no data + return pd.DataFrame() + + # By default, pandas reads text strings with whitespaces as multiple columns, but + # GMT contacatenates all trailing text as a single string column. Neet do find all + # string columns (with dtype="object") and combine them into a single string column. 
+ string_columns = df.select_dtypes(include=["object"]).columns + if len(string_columns) > 1: + df[string_columns[0]] = df[string_columns].apply(lambda x: " ".join(x), axis=1) + df = df.drop(string_columns[1:], axis=1) + # Convert 'object' to 'string' type + df = df.convert_dtypes( + convert_string=True, + convert_integer=False, + convert_boolean=False, + convert_floating=False, + ) + return df + + +def dataframe_from_gmt(fname): + """ + Read a tabular data as pandas.DataFrame using GMT virtual file. + """ + with Session() as lib: + with lib.virtualfile_out(kind="dataset") as vouttbl: + lib.call_module("read", f"{fname} {vouttbl} -Td") + df = lib.virtualfile_to_dataset(vfname=vouttbl) + return df + + +@pytest.mark.benchmark +def test_dataset(): + """ + Test the basic functionality of GMT_DATASET. + """ + with GMTTempFile(suffix=".txt") as tmpfile: + with Path(tmpfile.name).open(mode="w") as fp: + print(">", file=fp) + print("1.0 2.0 3.0 TEXT1 TEXT23", file=fp) + print("4.0 5.0 6.0 TEXT4 TEXT567", file=fp) + print(">", file=fp) + print("7.0 8.0 9.0 TEXT8 TEXT90", file=fp) + print("10.0 11.0 12.0 TEXT123 TEXT456789", file=fp) + + df = dataframe_from_gmt(tmpfile.name) + expected_df = dataframe_from_pandas(tmpfile.name, comment=">") + pd.testing.assert_frame_equal(df, expected_df) + + +def test_dataset_empty(): + """ + Make sure that an empty DataFrame is returned if a file has no data. 
+ """ + with GMTTempFile(suffix=".txt") as tmpfile: + with Path(tmpfile.name).open(mode="w") as fp: + print("# This is a comment line.", file=fp) + + df = dataframe_from_gmt(tmpfile.name) + assert df.empty # Empty DataFrame + expected_df = dataframe_from_pandas(tmpfile.name) + pd.testing.assert_frame_equal(df, expected_df) From 2e6e2772e12cf082d87318d6fbad6b6efd98878e Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Thu, 21 Mar 2024 19:01:15 +0800 Subject: [PATCH 2/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Yvonne Fröhlich <94163266+yvonnefroehlich@users.noreply.github.com> --- pygmt/tests/test_datatypes_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pygmt/tests/test_datatypes_dataset.py b/pygmt/tests/test_datatypes_dataset.py index c349ba413dc..7861b6b3119 100644 --- a/pygmt/tests/test_datatypes_dataset.py +++ b/pygmt/tests/test_datatypes_dataset.py @@ -12,18 +12,18 @@ def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None): """ - Read a tabular data as pandas.DataFrame object using pandas.read_csv(). + Read tabular data as pandas.DataFrame object using pandas.read_csv(). The parameters have the same meaning as in ``pandas.read_csv()``. """ try: df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header) except pd.errors.EmptyDataError: - # Return an empty DataFrame if the file has no data + # Return an empty DataFrame if the file contains no data return pd.DataFrame() # By default, pandas reads text strings with whitespaces as multiple columns, but - # GMT contacatenates all trailing text as a single string column. Neet do find all + # GMT concatenates all trailing text as a single string column. Need to find all # string columns (with dtype="object") and combine them into a single string column. 
string_columns = df.select_dtypes(include=["object"]).columns if len(string_columns) > 1: @@ -41,7 +41,7 @@ def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=No def dataframe_from_gmt(fname): """ - Read a tabular data as pandas.DataFrame using GMT virtual file. + Read tabular data as pandas.DataFrame using GMT virtual file. """ with Session() as lib: with lib.virtualfile_out(kind="dataset") as vouttbl: @@ -71,7 +71,7 @@ def test_dataset(): def test_dataset_empty(): """ - Make sure that an empty DataFrame is returned if a file has no data. + Make sure that an empty DataFrame is returned if a file contains no data. """ with GMTTempFile(suffix=".txt") as tmpfile: with Path(tmpfile.name).open(mode="w") as fp: From a2c48d5486c88d8eb276942f037623de8abed84e Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Wed, 27 Mar 2024 10:31:25 +0800 Subject: [PATCH 3/8] Fixes --- pygmt/datatypes/dataset.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index f6bde88dc3b..d4aca2dbbf5 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -156,9 +156,7 @@ def to_dataframe( the same. The same column in all segments of all tables are concatenated. The trailing text column is also concatenated as a single string column. - <<<<<<< HEAD - If the object has no data, an empty DataFrame will be returned. - ======= + If the object contains no data, an empty DataFrame will be returned. Parameters ---------- @@ -169,7 +167,6 @@ def to_dataframe( column names to types. index_col Column to set as index. 
- >>>>>>> main Returns ------- From 1281ec0e454ba014e0d6e79adb63078122aead02 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Wed, 27 Mar 2024 13:47:48 +0800 Subject: [PATCH 4/8] Add more comments --- pygmt/datatypes/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index d4aca2dbbf5..dd6aba66170 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -234,10 +234,10 @@ def to_dataframe( # Create a DataFrame object by concatenating multiple columns df = pd.concat(objs=vectors, axis="columns") - if column_names is not None: + if column_names is not None: # Assign column names df.columns = column_names - if dtype is not None: + if dtype is not None: # Set dtype for the whole dataset or individual columns df = df.astype(dtype) - if index_col is not None: + if index_col is not None: # Use a specific column as index df = df.set_index(index_col) return df From 71cc9b7d8c5b3c9ae9c0765d501e1cc5d79687a0 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 29 Mar 2024 09:46:01 +0800 Subject: [PATCH 5/8] Return an empty DataFrame with column names Co-authored-by: Wei Ji <23487320+weiji14@users.noreply.github.com> --- pygmt/datatypes/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index dd6aba66170..a3c2f65c7bf 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -230,10 +230,10 @@ def to_dataframe( # Return an empty DataFrame if no columns are found. 
if len(vectors) == 0: - return pd.DataFrame() - - # Create a DataFrame object by concatenating multiple columns - df = pd.concat(objs=vectors, axis="columns") + df = pd.DataFrame(columns=column_names) + else: + # Create a DataFrame object by concatenating multiple columns + df = pd.concat(objs=vectors, axis="columns") if column_names is not None: # Assign column names df.columns = column_names if dtype is not None: # Set dtype for the whole dataset or individual columns From 065ec12492018e8fac6839a4db3dfbcbaa618612 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 29 Mar 2024 09:57:48 +0800 Subject: [PATCH 6/8] Do not assign column names again for empty DataFrame --- pygmt/datatypes/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index a3c2f65c7bf..6b45e001556 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -234,8 +234,8 @@ def to_dataframe( else: # Create a DataFrame object by concatenating multiple columns df = pd.concat(objs=vectors, axis="columns") - if column_names is not None: # Assign column names - df.columns = column_names + if column_names is not None: # Assign column names + df.columns = column_names if dtype is not None: # Set dtype for the whole dataset or individual columns df = df.astype(dtype) if index_col is not None: # Use a specific column as index From dbfc2aead80724f723780ca903b68368740b2eb0 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 29 Mar 2024 11:26:40 +0800 Subject: [PATCH 7/8] Improve type hints --- pygmt/datatypes/dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index 6b45e001556..ae402270cb9 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -3,7 +3,8 @@ """ import ctypes as ctp -from typing import ClassVar +from collections.abc import Mapping +from typing import Any, ClassVar import numpy 
as np import pandas as pd @@ -145,8 +146,8 @@ class _GMT_DATASEGMENT(ctp.Structure): # noqa: N801 def to_dataframe( self, - column_names: list[str] | None = None, - dtype: type | dict[str, type] | None = None, + column_names: pd.Index | None = None, + dtype: type | Mapping[Any, type] | None = None, index_col: str | int | None = None, ) -> pd.DataFrame: """ From 06790e280659d6e700a2b64a3b360f95109baf43 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 29 Mar 2024 12:06:26 +0800 Subject: [PATCH 8/8] Apply suggestions from code review Co-authored-by: Wei Ji <23487320+weiji14@users.noreply.github.com> --- pygmt/datatypes/dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py index ae402270cb9..7a61b7f3d91 100644 --- a/pygmt/datatypes/dataset.py +++ b/pygmt/datatypes/dataset.py @@ -157,7 +157,8 @@ def to_dataframe( the same. The same column in all segments of all tables are concatenated. The trailing text column is also concatenated as a single string column. - If the object contains no data, an empty DataFrame will be returned. + If the object contains no data, an empty DataFrame will be returned (with the + column names and dtypes set if provided). Parameters ---------- @@ -229,8 +230,8 @@ def to_dataframe( pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype()) ) - # Return an empty DataFrame if no columns are found. if len(vectors) == 0: + # Return an empty DataFrame if no columns are found. df = pd.DataFrame(columns=column_names) else: # Create a DataFrame object by concatenating multiple columns