From 175ba3c7cc5c5126f015ef7e7a1ba95b6ad8dacc Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Thu, 21 Mar 2024 13:29:36 +0800
Subject: [PATCH 1/8] GMT_DATASET: Return an empty DataFrame if the file has no
 data

---
 pygmt/datatypes/dataset.py            | 10 ++--
 pygmt/tests/test_datatypes_dataset.py | 83 +++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644 pygmt/tests/test_datatypes_dataset.py

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index a0d0547f3ca..4828afb6064 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure):  # noqa: N801
     """
     GMT dataset structure for holding multiple tables (files).
 
-    This class is only meant for internal use by PyGMT and is not exposed to users.
-    See the GMT source code gmt_resources.h for the original C struct definitions.
+    This class is only meant for internal use and is not exposed to users. See the GMT
+    source code ``gmt_resources.h`` for the original C struct definitions.
 
     Examples
     --------
@@ -151,6 +151,8 @@ def to_dataframe(self) -> pd.DataFrame:
         the same. The same column in all segments of all tables are concatenated. The
         trailing text column is also concatenated as a single string column.
 
+        If the object has no data, an empty DataFrame will be returned.
+
         Returns
         -------
         df
@@ -185,8 +187,8 @@ def to_dataframe(self) -> pd.DataFrame:
         >>> df.dtypes.to_list()
         [dtype('float64'), dtype('float64'), dtype('float64'), string[python]]
         """
-        # Deal with numeric columns
         vectors = []
+        # Deal with numeric columns
         for icol in range(self.n_columns):
             colvector = []
             for itbl in range(self.n_tables):
@@ -211,5 +213,5 @@ def to_dataframe(self) -> pd.DataFrame:
                 pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
             )
 
-        df = pd.concat(objs=vectors, axis=1)
+        df = pd.concat(objs=vectors, axis=1) if vectors else pd.DataFrame()
         return df
diff --git a/pygmt/tests/test_datatypes_dataset.py b/pygmt/tests/test_datatypes_dataset.py
new file mode 100644
index 00000000000..c349ba413dc
--- /dev/null
+++ b/pygmt/tests/test_datatypes_dataset.py
@@ -0,0 +1,83 @@
+"""
+Tests for GMT_DATASET data type.
+"""
+
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from pygmt.clib import Session
+from pygmt.helpers import GMTTempFile
+
+
+def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None):
+    """
+    Read a tabular data as pandas.DataFrame object using pandas.read_csv().
+
+    The parameters have the same meaning as in ``pandas.read_csv()``.
+    """
+    try:
+        df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header)
+    except pd.errors.EmptyDataError:
+        # Return an empty DataFrame if the file has no data
+        return pd.DataFrame()
+
+    # By default, pandas reads text strings with whitespaces as multiple columns, but
+    # GMT contacatenates all trailing text as a single string column. Neet do find all
+    # string columns (with dtype="object") and combine them into a single string column.
+    string_columns = df.select_dtypes(include=["object"]).columns
+    if len(string_columns) > 1:
+        df[string_columns[0]] = df[string_columns].apply(lambda x: " ".join(x), axis=1)
+        df = df.drop(string_columns[1:], axis=1)
+    # Convert 'object' to 'string' type
+    df = df.convert_dtypes(
+        convert_string=True,
+        convert_integer=False,
+        convert_boolean=False,
+        convert_floating=False,
+    )
+    return df
+
+
+def dataframe_from_gmt(fname):
+    """
+    Read a tabular data as pandas.DataFrame using GMT virtual file.
+    """
+    with Session() as lib:
+        with lib.virtualfile_out(kind="dataset") as vouttbl:
+            lib.call_module("read", f"{fname} {vouttbl} -Td")
+            df = lib.virtualfile_to_dataset(vfname=vouttbl)
+            return df
+
+
+@pytest.mark.benchmark
+def test_dataset():
+    """
+    Test the basic functionality of GMT_DATASET.
+    """
+    with GMTTempFile(suffix=".txt") as tmpfile:
+        with Path(tmpfile.name).open(mode="w") as fp:
+            print(">", file=fp)
+            print("1.0 2.0 3.0 TEXT1 TEXT23", file=fp)
+            print("4.0 5.0 6.0 TEXT4 TEXT567", file=fp)
+            print(">", file=fp)
+            print("7.0 8.0 9.0 TEXT8 TEXT90", file=fp)
+            print("10.0 11.0 12.0 TEXT123 TEXT456789", file=fp)
+
+        df = dataframe_from_gmt(tmpfile.name)
+        expected_df = dataframe_from_pandas(tmpfile.name, comment=">")
+        pd.testing.assert_frame_equal(df, expected_df)
+
+
+def test_dataset_empty():
+    """
+    Make sure that an empty DataFrame is returned if a file has no data.
+    """
+    with GMTTempFile(suffix=".txt") as tmpfile:
+        with Path(tmpfile.name).open(mode="w") as fp:
+            print("# This is a comment line.", file=fp)
+
+        df = dataframe_from_gmt(tmpfile.name)
+        assert df.empty  # Empty DataFrame
+        expected_df = dataframe_from_pandas(tmpfile.name)
+        pd.testing.assert_frame_equal(df, expected_df)

From 2e6e2772e12cf082d87318d6fbad6b6efd98878e Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Thu, 21 Mar 2024 19:01:15 +0800
Subject: [PATCH 2/8] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Yvonne Fröhlich <94163266+yvonnefroehlich@users.noreply.github.com>
---
 pygmt/tests/test_datatypes_dataset.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pygmt/tests/test_datatypes_dataset.py b/pygmt/tests/test_datatypes_dataset.py
index c349ba413dc..7861b6b3119 100644
--- a/pygmt/tests/test_datatypes_dataset.py
+++ b/pygmt/tests/test_datatypes_dataset.py
@@ -12,18 +12,18 @@
 
 def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None):
     """
-    Read a tabular data as pandas.DataFrame object using pandas.read_csv().
+    Read tabular data as pandas.DataFrame object using pandas.read_csv().
 
     The parameters have the same meaning as in ``pandas.read_csv()``.
     """
     try:
         df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header)
     except pd.errors.EmptyDataError:
-        # Return an empty DataFrame if the file has no data
+        # Return an empty DataFrame if the file contains no data
         return pd.DataFrame()
 
     # By default, pandas reads text strings with whitespaces as multiple columns, but
-    # GMT contacatenates all trailing text as a single string column. Neet do find all
+    # GMT concatenates all trailing text as a single string column. Need to find all
     # string columns (with dtype="object") and combine them into a single string column.
     string_columns = df.select_dtypes(include=["object"]).columns
     if len(string_columns) > 1:
@@ -41,7 +41,7 @@ def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=No
 
 def dataframe_from_gmt(fname):
     """
-    Read a tabular data as pandas.DataFrame using GMT virtual file.
+    Read tabular data as pandas.DataFrame using GMT virtual file.
     """
     with Session() as lib:
         with lib.virtualfile_out(kind="dataset") as vouttbl:
@@ -71,7 +71,7 @@ def test_dataset():
 
 def test_dataset_empty():
     """
-    Make sure that an empty DataFrame is returned if a file has no data.
+    Make sure that an empty DataFrame is returned if a file contains no data.
     """
     with GMTTempFile(suffix=".txt") as tmpfile:
         with Path(tmpfile.name).open(mode="w") as fp:

From a2c48d5486c88d8eb276942f037623de8abed84e Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Wed, 27 Mar 2024 10:31:25 +0800
Subject: [PATCH 3/8] Remove leftover merge conflict markers from docstring

---
 pygmt/datatypes/dataset.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index f6bde88dc3b..d4aca2dbbf5 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -156,9 +156,7 @@ def to_dataframe(
         the same. The same column in all segments of all tables are concatenated. The
         trailing text column is also concatenated as a single string column.
 
-        <<<<<<< HEAD
-        If the object has no data, an empty DataFrame will be returned.
-        =======
+        If the object contains no data, an empty DataFrame will be returned.
 
         Parameters
         ----------
@@ -169,7 +167,6 @@ def to_dataframe(
             column names to types.
         index_col
             Column to set as index.
-        >>>>>>> main
 
         Returns
         -------

From 1281ec0e454ba014e0d6e79adb63078122aead02 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Wed, 27 Mar 2024 13:47:48 +0800
Subject: [PATCH 4/8] Add more comments

---
 pygmt/datatypes/dataset.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index d4aca2dbbf5..dd6aba66170 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -234,10 +234,10 @@ def to_dataframe(
 
         # Create a DataFrame object by concatenating multiple columns
         df = pd.concat(objs=vectors, axis="columns")
-        if column_names is not None:
+        if column_names is not None:  # Assign column names
             df.columns = column_names
-        if dtype is not None:
+        if dtype is not None:  # Set dtype for the whole dataset or individual columns
             df = df.astype(dtype)
-        if index_col is not None:
+        if index_col is not None:  # Use a specific column as index
             df = df.set_index(index_col)
         return df

From 71cc9b7d8c5b3c9ae9c0765d501e1cc5d79687a0 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Fri, 29 Mar 2024 09:46:01 +0800
Subject: [PATCH 5/8] Return an empty DataFrame with column names

Co-authored-by: Wei Ji <23487320+weiji14@users.noreply.github.com>
---
 pygmt/datatypes/dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index dd6aba66170..a3c2f65c7bf 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -230,10 +230,10 @@ def to_dataframe(
 
         # Return an empty DataFrame if no columns are found.
         if len(vectors) == 0:
-            return pd.DataFrame()
-
-        # Create a DataFrame object by concatenating multiple columns
-        df = pd.concat(objs=vectors, axis="columns")
+            df = pd.DataFrame(columns=column_names)
+        else:
+            # Create a DataFrame object by concatenating multiple columns
+            df = pd.concat(objs=vectors, axis="columns")
         if column_names is not None:  # Assign column names
             df.columns = column_names
         if dtype is not None:  # Set dtype for the whole dataset or individual columns

From 065ec12492018e8fac6839a4db3dfbcbaa618612 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Fri, 29 Mar 2024 09:57:48 +0800
Subject: [PATCH 6/8] Do not assign column names again for empty DataFrame

---
 pygmt/datatypes/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index a3c2f65c7bf..6b45e001556 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -234,8 +234,8 @@ def to_dataframe(
         else:
             # Create a DataFrame object by concatenating multiple columns
             df = pd.concat(objs=vectors, axis="columns")
-        if column_names is not None:  # Assign column names
-            df.columns = column_names
+            if column_names is not None:  # Assign column names
+                df.columns = column_names
         if dtype is not None:  # Set dtype for the whole dataset or individual columns
             df = df.astype(dtype)
         if index_col is not None:  # Use a specific column as index

From dbfc2aead80724f723780ca903b68368740b2eb0 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Fri, 29 Mar 2024 11:26:40 +0800
Subject: [PATCH 7/8] Improve type hints

---
 pygmt/datatypes/dataset.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index 6b45e001556..ae402270cb9 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -3,7 +3,8 @@
 """
 
 import ctypes as ctp
-from typing import ClassVar
+from collections.abc import Mapping
+from typing import Any, ClassVar
 
 import numpy as np
 import pandas as pd
@@ -145,8 +146,8 @@ class _GMT_DATASEGMENT(ctp.Structure):  # noqa: N801
 
     def to_dataframe(
         self,
-        column_names: list[str] | None = None,
-        dtype: type | dict[str, type] | None = None,
+        column_names: pd.Index | None = None,
+        dtype: type | Mapping[Any, type] | None = None,
         index_col: str | int | None = None,
     ) -> pd.DataFrame:
         """

From 06790e280659d6e700a2b64a3b360f95109baf43 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Fri, 29 Mar 2024 12:06:26 +0800
Subject: [PATCH 8/8] Apply suggestions from code review

Co-authored-by: Wei Ji <23487320+weiji14@users.noreply.github.com>
---
 pygmt/datatypes/dataset.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pygmt/datatypes/dataset.py b/pygmt/datatypes/dataset.py
index ae402270cb9..7a61b7f3d91 100644
--- a/pygmt/datatypes/dataset.py
+++ b/pygmt/datatypes/dataset.py
@@ -157,7 +157,8 @@ def to_dataframe(
         the same. The same column in all segments of all tables are concatenated. The
         trailing text column is also concatenated as a single string column.
 
-        If the object contains no data, an empty DataFrame will be returned.
+        If the object contains no data, an empty DataFrame will be returned (with the
+        column names and dtypes set if provided).
 
         Parameters
         ----------
@@ -229,8 +230,8 @@ def to_dataframe(
                 pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
             )
 
-        # Return an empty DataFrame if no columns are found.
         if len(vectors) == 0:
+            # Return an empty DataFrame if no columns are found.
             df = pd.DataFrame(columns=column_names)
         else:
             # Create a DataFrame object by concatenating multiple columns