Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GMT_DATASET.to_dataframe: Return an empty DataFrame if a file contains no data #3131

Merged
merged 12 commits into from
Mar 29, 2024
31 changes: 20 additions & 11 deletions pygmt/datatypes/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"""

import ctypes as ctp
from typing import ClassVar
from collections.abc import Mapping
from typing import Any, ClassVar

import numpy as np
import pandas as pd
Expand All @@ -13,8 +14,8 @@ class _GMT_DATASET(ctp.Structure): # noqa: N801
"""
GMT dataset structure for holding multiple tables (files).

This class is only meant for internal use by PyGMT and is not exposed to users.
See the GMT source code gmt_resources.h for the original C struct definitions.
This class is only meant for internal use and is not exposed to users. See the GMT
source code ``gmt_resources.h`` for the original C struct definitions.

Examples
--------
Expand Down Expand Up @@ -145,8 +146,8 @@ class _GMT_DATASEGMENT(ctp.Structure): # noqa: N801

def to_dataframe(
self,
column_names: list[str] | None = None,
dtype: type | dict[str, type] | None = None,
column_names: pd.Index | None = None,
dtype: type | Mapping[Any, type] | None = None,
index_col: str | int | None = None,
) -> pd.DataFrame:
"""
Expand All @@ -156,6 +157,9 @@ def to_dataframe(
the same. The same column in all segments of all tables are concatenated. The
trailing text column is also concatenated as a single string column.

If the object contains no data, an empty DataFrame will be returned (with the
column names and dtypes set if provided).

Parameters
----------
column_names
Expand Down Expand Up @@ -200,8 +204,8 @@ def to_dataframe(
>>> df.dtypes.to_list()
[dtype('float64'), dtype('float64'), dtype('float64'), string[python]]
"""
# Deal with numeric columns
vectors = []
# Deal with numeric columns
for icol in range(self.n_columns):
colvector = []
for itbl in range(self.n_tables):
Expand All @@ -226,11 +230,16 @@ def to_dataframe(
pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
)

df = pd.concat(objs=vectors, axis="columns")
if column_names is not None: # Assign column names
df.columns = column_names
if dtype is not None:
if len(vectors) == 0:
# Return an empty DataFrame if no columns are found.
df = pd.DataFrame(columns=column_names)
else:
# Create a DataFrame object by concatenating multiple columns
df = pd.concat(objs=vectors, axis="columns")
if column_names is not None: # Assign column names
df.columns = column_names
if dtype is not None: # Set dtype for the whole dataset or individual columns
df = df.astype(dtype)
if index_col is not None:
if index_col is not None: # Use a specific column as index
df = df.set_index(index_col)
return df
83 changes: 83 additions & 0 deletions pygmt/tests/test_datatypes_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Tests for GMT_DATASET data type.
"""

from pathlib import Path

import pandas as pd
import pytest
from pygmt.clib import Session
from pygmt.helpers import GMTTempFile


def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None):
    """
    Read tabular data as pandas.DataFrame object using pandas.read_csv().

    The parameters have the same meaning as in ``pandas.read_csv()``.

    An empty DataFrame (no rows, no columns) is returned if the input contains no
    data. If the parsed table has more than one text column, the text columns are
    joined with single spaces into the first text column and the others are dropped.
    """
    try:
        df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header)
    except pd.errors.EmptyDataError:
        # Return an empty DataFrame if the file contains no data
        return pd.DataFrame()

    # By default, pandas reads text strings with whitespaces as multiple columns, but
    # GMT concatenates all trailing text as a single string column. Need to find all
    # string columns (with dtype="object") and combine them into a single string column.
    string_columns = df.select_dtypes(include=["object"]).columns
    if len(string_columns) > 1:
        # Each row is passed to str.join, so the row's text fields become one string.
        df[string_columns[0]] = df[string_columns].apply(" ".join, axis=1)
        df = df.drop(string_columns[1:], axis=1)

    # Convert 'object' to 'string' type; leave numeric and boolean dtypes untouched.
    return df.convert_dtypes(
        convert_string=True,
        convert_integer=False,
        convert_boolean=False,
        convert_floating=False,
    )


def dataframe_from_gmt(fname):
    """
    Read tabular data as pandas.DataFrame using GMT virtual file.
    """
    # NOTE: For reference, GMT provides two special/undocumented modules, ``read`` and
    # ``write`` (source code: gmt/src/gmtread.c and gmt/src/gmtwrite.c), that can read
    # a file into a GMT object (e.g., reading a tabular file as GMT_DATASET, or reading
    # a grid as GMT_GRID). The special ``read`` module is also used frequently in the
    # doctests of the pygmt.clib.session module. It may be made public in the future,
    # as already done in GMT.jl:
    #   https://www.generic-mapping-tools.org/GMT.jl/dev/#GMT.gmtread-Tuple{String}
    #   https://www.generic-mapping-tools.org/GMT.jl/dev/#GMT.gmtwrite
    with Session() as lib, lib.virtualfile_out(kind="dataset") as vouttbl:
        # Read the file into the output virtual file, then convert that virtual file
        # while it is still open.
        lib.call_module("read", f"{fname} {vouttbl} -Td")
        df = lib.virtualfile_to_dataset(vfname=vouttbl)
    return df


@pytest.mark.benchmark
def test_dataset():
    """
    Test the basic functionality of GMT_DATASET.

    A two-segment table with three numeric columns and trailing text is written to a
    temporary file, then read through GMT and through pandas; both results must match.
    """
    lines = [
        ">",
        "1.0 2.0 3.0 TEXT1 TEXT23",
        "4.0 5.0 6.0 TEXT4 TEXT567",
        ">",
        "7.0 8.0 9.0 TEXT8 TEXT90",
        "10.0 11.0 12.0 TEXT123 TEXT456789",
    ]
    with GMTTempFile(suffix=".txt") as tmpfile:
        # One newline-terminated record per entry, exactly as print() would emit.
        Path(tmpfile.name).write_text("".join(f"{line}\n" for line in lines))

        df = dataframe_from_gmt(tmpfile.name)
        # ">" lines are segment headers here, so pandas must skip them as comments.
        expected_df = dataframe_from_pandas(tmpfile.name, comment=">")
        pd.testing.assert_frame_equal(df, expected_df)


def test_dataset_empty():
    """
    Make sure that an empty DataFrame is returned if a file contains no data.

    The temporary file holds only a comment line, so neither reader finds any records.
    """
    with GMTTempFile(suffix=".txt") as tmpfile:
        Path(tmpfile.name).write_text("# This is a comment line.\n")

        df = dataframe_from_gmt(tmpfile.name)
        assert df.empty  # Empty DataFrame
        expected_df = dataframe_from_pandas(tmpfile.name)
        pd.testing.assert_frame_equal(df, expected_df)