Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GMT_DATASET.to_dataframe: Return an empty DataFrame if a file contains no data #3131

Merged
merged 12 commits into from
Mar 29, 2024
15 changes: 11 additions & 4 deletions pygmt/datatypes/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ class _GMT_DATASET(ctp.Structure): # noqa: N801
"""
GMT dataset structure for holding multiple tables (files).

This class is only meant for internal use by PyGMT and is not exposed to users.
See the GMT source code gmt_resources.h for the original C struct definitions.
This class is only meant for internal use and is not exposed to users. See the GMT
source code ``gmt_resources.h`` for the original C struct definitions.

Examples
--------
Expand Down Expand Up @@ -156,6 +156,8 @@ def to_dataframe(
the same. The same column in all segments of all tables are concatenated. The
trailing text column is also concatenated as a single string column.

If the object contains no data, an empty DataFrame will be returned.
seisman marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
column_names
Expand Down Expand Up @@ -200,8 +202,8 @@ def to_dataframe(
>>> df.dtypes.to_list()
[dtype('float64'), dtype('float64'), dtype('float64'), string[python]]
"""
# Deal with numeric columns
vectors = []
# Deal with numeric columns
for icol in range(self.n_columns):
colvector = []
for itbl in range(self.n_tables):
Expand All @@ -226,8 +228,13 @@ def to_dataframe(
pd.Series(data=np.char.decode(textvector), dtype=pd.StringDtype())
)

# Return an empty DataFrame if no columns are found.
if len(vectors) == 0:
seisman marked this conversation as resolved.
Show resolved Hide resolved
return pd.DataFrame()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, it returns an empty DataFrame without columns and rows, but an empty DataFrame with columns is also allowed, e.g.,

return pd.DataFrame(columns=column_names)

I guess either is fine. I think we can use return pd.DataFrame() now and make changes if necessary.

Copy link
Member

@weiji14 weiji14 Mar 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should return the column_names, so that users who want to e.g. do pd.concat on multiple pd.DataFrame outputs from running an algorithm like pygmt.select in a for-loop can do so in a more straightforward way. Note that we should also set the dtypes of the columns properly, even if the rows are empty, otherwise the dtypes will all become object:

df1 = pd.DataFrame(data=[[0, 1, 2]], columns=["x", "y", "z"])
print(df1.dtypes)
# x    int64
# y    int64
# z    int64
# dtype: object
df2 = pd.DataFrame(columns=["x", "y", "z"])
print(df2.dtypes)
# x    object
# y    object
# z    object
# dtype: object

pd.concat(objs=[df1, df2]).dtypes
# x    object
# y    object
# z    object
# dtype: object

See my other suggestion at #3131 (comment) on not returning an empty pd.DataFrame() early, until the dtype is set with df.astype(dtype) below.


# Create a DataFrame object by concatenating multiple columns
df = pd.concat(objs=vectors, axis="columns")
seisman marked this conversation as resolved.
Show resolved Hide resolved
if column_names is not None: # Assign column names
if column_names is not None:
df.columns = column_names
if dtype is not None:
df = df.astype(dtype)
Expand Down
83 changes: 83 additions & 0 deletions pygmt/tests/test_datatypes_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""
Tests for GMT_DATASET data type.
"""

from pathlib import Path

import pandas as pd
import pytest
from pygmt.clib import Session
from pygmt.helpers import GMTTempFile


def dataframe_from_pandas(filepath_or_buffer, sep=r"\s+", comment="#", header=None):
    """
    Read tabular data as pandas.DataFrame object using pandas.read_csv().

    All text columns are merged into a single string column, to match the way GMT
    concatenates trailing text.

    Parameters
    ----------
    filepath_or_buffer, sep, comment, header
        Forwarded unchanged to ``pandas.read_csv()``, where they have the same
        meaning.

    Returns
    -------
    pandas.DataFrame
        The parsed table, or an empty DataFrame (no rows and no columns) if the
        input contains no data.
    """
    try:
        df = pd.read_csv(filepath_or_buffer, sep=sep, comment=comment, header=header)
    except pd.errors.EmptyDataError:
        # Return an empty DataFrame if the file contains no data
        return pd.DataFrame()

    # By default, pandas reads text strings with whitespaces as multiple columns, but
    # GMT concatenates all trailing text as a single string column. Need to find all
    # text columns and combine them into a single string column. Depending on the
    # pandas version/configuration, text may be stored with the "object" dtype or a
    # dedicated string dtype, so check the dtype of each column instead of selecting
    # "object" columns only.
    string_columns = [
        name
        for name, column_dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(column_dtype)
    ]
    if len(string_columns) > 1:
        # Rows with fewer words than the widest row are padded with missing values
        # by pandas; skip them so that joining does not fail on non-string cells.
        df[string_columns[0]] = df[string_columns].apply(
            lambda row: " ".join(row.dropna()), axis=1
        )
        df = df.drop(columns=string_columns[1:])

    # Convert 'object' to 'string' type
    return df.convert_dtypes(
        convert_string=True,
        convert_integer=False,
        convert_boolean=False,
        convert_floating=False,
    )


def dataframe_from_gmt(fname):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reference, GMT provides two special, undocumented modules, read and write (their source code is in gmt/src/gmtread.c and gmt/src/gmtwrite.c), that can read a file into a GMT object (e.g., reading a tabular file as GMT_DATASET, or reading a grid as GMT_GRID). Currently, we're frequently using the special read module in the doctests of the pygmt.clib.session module (similar to lines 46-50 below). We may want to make it public in the future, as already done in GMT.jl (https://www.generic-mapping-tools.org/GMT.jl/dev/#GMT.gmtread-Tuple{String} and https://www.generic-mapping-tools.org/GMT.jl/dev/#GMT.gmtwrite).

"""
Read tabular data as pandas.DataFrame using GMT virtual file.
"""
with Session() as lib:
with lib.virtualfile_out(kind="dataset") as vouttbl:
lib.call_module("read", f"{fname} {vouttbl} -Td")
df = lib.virtualfile_to_dataset(vfname=vouttbl)
return df


@pytest.mark.benchmark
def test_dataset():
    """
    Check that GMT_DATASET round-trips a multi-segment table with trailing text.

    The same file is read through GMT and through pandas, and the two resulting
    DataFrames must be equal.
    """
    # Two segments (each introduced by a ">" header line) of two records each.
    records = [
        ">",
        "1.0 2.0 3.0 TEXT1 TEXT23",
        "4.0 5.0 6.0 TEXT4 TEXT567",
        ">",
        "7.0 8.0 9.0 TEXT8 TEXT90",
        "10.0 11.0 12.0 TEXT123 TEXT456789",
    ]
    with GMTTempFile(suffix=".txt") as tmpfile:
        Path(tmpfile.name).write_text("".join(f"{line}\n" for line in records))

        result = dataframe_from_gmt(tmpfile.name)
        # Segment headers start with ">", so have pandas treat them as comments.
        reference = dataframe_from_pandas(tmpfile.name, comment=">")
        pd.testing.assert_frame_equal(result, reference)


def test_dataset_empty():
    """
    Check that a file without any data records yields an empty DataFrame.
    """
    with GMTTempFile(suffix=".txt") as tmpfile:
        # The file holds a single comment line and no data records.
        Path(tmpfile.name).write_text("# This is a comment line.\n")

        result = dataframe_from_gmt(tmpfile.name)
        assert result.empty  # Empty DataFrame
        reference = dataframe_from_pandas(tmpfile.name)
        pd.testing.assert_frame_equal(result, reference)