Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pyarrow: Check compatibility of pyarrow.array with string type #2933

Merged
merged 30 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
1f32a7c
Check passing pyarrow.array with string type to pygmt.text
weiji14 Dec 30, 2023
4c4e064
Check passing pyarrow.array with string type to virtualfile_from_vectors
weiji14 Dec 30, 2023
07fbca6
Merge branch 'main' into pyarrow/string
weiji14 Oct 11, 2024
d379e46
Enable passing pyarrow.StringArray to clib.Session.put_strings
weiji14 Oct 11, 2024
cfda386
Use "string" instead of pyarrow.string() in case pyarrow not installed
weiji14 Oct 11, 2024
0a6cda5
Try to fix type hints
weiji14 Oct 11, 2024
f59f93c
Add np.ndarray to StringArrayTypes and fix/ignore remaining type errors
weiji14 Oct 11, 2024
757da24
Merge branch 'main' into pyarrow/string
weiji14 Oct 11, 2024
17c1e9c
Move StringArrayTypes to pygmt/_typing.py
weiji14 Oct 11, 2024
0105d64
Add pyarrow to docs CI
weiji14 Oct 11, 2024
3ad0c86
Use np.asarray to convert pa.StringArray instead of .to_pylist()
weiji14 Oct 11, 2024
4bea288
Update note to say that PyArrow string types are now supported
weiji14 Oct 11, 2024
371174a
Add back pytest.mark.benchmark marker
weiji14 Oct 11, 2024
b588730
Add intersphinx link for pyarrow
weiji14 Oct 11, 2024
faf2065
Apply suggestions from code review
weiji14 Nov 6, 2024
b2efbb4
Merge branch 'main' into pyarrow/string
weiji14 Nov 6, 2024
9fd77dc
format
weiji14 Nov 6, 2024
ccf4eff
Merge branch 'main' into pyarrow/string
weiji14 Nov 7, 2024
44d01ed
Merge branch 'main' into pyarrow/string
weiji14 Nov 15, 2024
a927202
Revert "Enable passing pyarrow.StringArray to clib.Session.put_strings"
weiji14 Nov 15, 2024
7b00248
Reduce diff from messy revert handling
weiji14 Nov 15, 2024
7dc353b
Revert support of pyarrow.array inputs to put_strings
weiji14 Nov 15, 2024
ce76152
Remove StringArrayTypes type hint
weiji14 Nov 15, 2024
ef431af
Revert "Remove StringArrayTypes type hint"
weiji14 Nov 15, 2024
acaf350
Improve type-hint of text parameter in pygmt.Figure.text
weiji14 Nov 15, 2024
6ad6eb9
Move pa.array parametrizations to another file
weiji14 Nov 15, 2024
d88accd
Pass a tuple of vectors to virtualfile_from_vectors
weiji14 Nov 15, 2024
265132e
Move skip_if_no and pyarrow import to test_clib_virtualfile_from_vectors
weiji14 Nov 15, 2024
edb3438
The text argument can be None
weiji14 Nov 15, 2024
8172102
Simplify to remove getattr(pa, "array", None) call
weiji14 Nov 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ jobs:
contextily
geopandas<1.0
ipython
pyarrow
seisman marked this conversation as resolved.
Show resolved Hide resolved
rioxarray
make
pip
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies:
- contextily
- geopandas<1.0
- ipython
- pyarrow
- rioxarray
# Development dependencies (general)
- make
Expand Down
10 changes: 10 additions & 0 deletions pygmt/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@
Type aliases for type hints.
"""

import contextlib
import importlib
from collections.abc import Sequence
from typing import Literal

import numpy as np

# Anchor codes
AnchorCode = Literal["TL", "TC", "TR", "ML", "MC", "MR", "BL", "BC", "BR"]

# String array types
StringArrayTypes = Sequence[str] | np.ndarray
with contextlib.suppress(ImportError):
StringArrayTypes |= importlib.import_module(name="pyarrow").StringArray
weiji14 marked this conversation as resolved.
Show resolved Hide resolved
15 changes: 11 additions & 4 deletions pygmt/clib/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np
import pandas as pd
from packaging.version import Version
from pygmt._typing import StringArrayTypes
from pygmt.exceptions import GMTInvalidInput


Expand Down Expand Up @@ -281,14 +282,15 @@ def sequence_to_ctypes_array(
return (ctype * size)(*sequence)


def strings_to_ctypes_array(strings: Sequence[str]) -> ctp.Array:
def strings_to_ctypes_array(strings: StringArrayTypes) -> ctp.Array:
"""
Convert a sequence (e.g., a list) of strings into a ctypes array.
Convert a sequence (e.g., a list) or numpy.ndarray of strings or a
pyarrow.StringArray into a ctypes array.

Parameters
----------
strings
A sequence of strings.
A sequence of strings, a numpy.ndarray of str dtype, or a pyarrow.StringArray.

Returns
-------
Expand All @@ -304,7 +306,12 @@ def strings_to_ctypes_array(strings: Sequence[str]) -> ctp.Array:
>>> [s.decode() for s in ctypes_array]
['first', 'second', 'third']
"""
return (ctp.c_char_p * len(strings))(*[s.encode() for s in strings])
try:
bytes_string_list = [s.encode() for s in strings]
except AttributeError: # 'pyarrow.StringScalar' object has no attribute 'encode'
# Convert pyarrow.StringArray to Python list first
bytes_string_list = [s.encode() for s in strings.to_pylist()] # type: ignore[union-attr]
return (ctp.c_char_p * len(strings))(*bytes_string_list)
weiji14 marked this conversation as resolved.
Show resolved Hide resolved


def array_to_datetime(array):
Expand Down
44 changes: 24 additions & 20 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import numpy as np
import pandas as pd
import xarray as xr
from pygmt._typing import StringArrayTypes
from pygmt.clib.conversion import (
array_to_datetime,
dataarray_to_matrix,
Expand Down Expand Up @@ -930,39 +931,43 @@ def put_vector(self, dataset, column, vector):
f"in column {column} of dataset."
)

def put_strings(self, dataset, family, strings):
def put_strings(
self,
dataset: ctp.c_void_p,
family: Literal["GMT_IS_VECTOR", "GMT_IS_MATRIX"],
weiji14 marked this conversation as resolved.
Show resolved Hide resolved
strings: StringArrayTypes,
):
"""
Attach a numpy 1-D array of dtype str as a column on a GMT dataset.
Attach a 1-D numpy array of dtype str or pyarrow.StringArray as a column on a
GMT dataset.

Use this function to attach string type numpy array data to a GMT
dataset and pass it to GMT modules. Wraps ``GMT_Put_Strings``.
Use this function to attach string type array data to a GMT dataset and pass it
to GMT modules. Wraps ``GMT_Put_Strings``.

The dataset must be created by :meth:`pygmt.clib.Session.create_data`
first.
The dataset must be created by :meth:`pygmt.clib.Session.create_data` first.

.. warning::
The numpy array must be C contiguous in memory. If it comes from a
column slice of a 2-D array, for example, you will have to make a
copy. Use :func:`numpy.ascontiguousarray` to make sure your vector
is contiguous (it won't copy if it already is).
The array must be C contiguous in memory. If it comes from a column slice of
a 2-D array, for example, you will have to make a copy. Use
:func:`numpy.ascontiguousarray` to make sure your vector is contiguous (it
won't copy if it already is).

Parameters
----------
dataset : :class:`ctypes.c_void_p`
dataset
The ctypes void pointer to a ``GMT_Dataset``. Create it with
:meth:`pygmt.clib.Session.create_data`.
weiji14 marked this conversation as resolved.
Show resolved Hide resolved
family : str
family
The family type of the dataset. Can be either ``GMT_IS_VECTOR`` or
``GMT_IS_MATRIX``.
strings : numpy 1-D array
The array that will be attached to the dataset. Must be a 1-D C
contiguous array.
strings
The array that will be attached to the dataset. Must be a 1-D C contiguous
array.

Raises
------
GMTCLibError
If given invalid input or ``GMT_Put_Strings`` exits with
status != 0.
If given invalid input or ``GMT_Put_Strings`` exits with status != 0.
"""
c_put_strings = self.get_libgmt_func(
"GMT_Put_Strings",
Expand All @@ -985,9 +990,8 @@ def put_strings(self, dataset, family, strings):
self.session_pointer, family_int, dataset, strings_pointer
)
if status != 0:
raise GMTCLibError(
f"Failed to put strings of type {strings.dtype} into dataset"
)
dtype = strings.dtype if hasattr(strings, "dtype") else type(strings)
raise GMTCLibError(f"Failed to put strings of type {dtype} into dataset")

def put_matrix(self, dataset, matrix, pad=0):
"""
Expand Down
25 changes: 22 additions & 3 deletions pygmt/tests/test_clib_put_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,31 @@
from pygmt import clib
from pygmt.exceptions import GMTCLibError
from pygmt.helpers import GMTTempFile
from pygmt.helpers.testing import skip_if_no
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also need to revert changes in this PR, since the low-level function Session.put_strings only accepts numpy.str_ and everything else should be converted to numpy.str_ by the _to_numpy function before calling put_strings.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, just to document things, the way we are intending pyarrow.string inputs to go through high-level modules like fig.text is:

  1. pygmt.Figure.text -> clib.Session.virtualfile_in -> clib.Session.virtualfile_from_vectors
  2. In clib.Session.virtualfile_from_vectors:
    • First call clib.conversion.vectors_to_arrays -> clib.conversion._to_numpy -> np.ascontiguous_array to convert pyarrow.string to np.str_
    • With a list[np.str_], this is then passed to clib.Session.put_strings

For the record, the original intention in this PR, mostly in commit d379e46, was to let clib.Session.put_strings handle both np.str_ and pyarrow.string.


try:
import pyarrow as pa
except ImportError:
pa = None


@pytest.mark.benchmark
def test_put_strings():
@pytest.mark.parametrize(
("array_func", "dtype"),
[
pytest.param(np.array, {"dtype": str}, id="str"),
pytest.param(
getattr(pa, "array", None),
{"type": "string"}, # pa.string()
marks=skip_if_no(package="pyarrow"),
id="pyarrow",
),
],
)
def test_put_strings(array_func, dtype):
"""
Check that assigning a numpy array of dtype str to a dataset works.
Check that assigning a numpy array of dtype str, or a pyarrow.StringArray to a
dataset works.
"""
with clib.Session() as lib:
dataset = lib.create_data(
Expand All @@ -24,7 +43,7 @@ def test_put_strings():
)
x = np.array([1, 2, 3, 4, 5], dtype=np.int32)
y = np.array([6, 7, 8, 9, 10], dtype=np.int32)
strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=str)
strings = array_func(["a", "bc", "defg", "hijklmn", "opqrst"], **dtype)
lib.put_vector(dataset, column=lib["GMT_X"], vector=x)
lib.put_vector(dataset, column=lib["GMT_Y"], vector=y)
lib.put_strings(
Expand Down
28 changes: 23 additions & 5 deletions pygmt/tests/test_clib_virtualfiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@
from pygmt import clib
from pygmt.exceptions import GMTCLibError, GMTInvalidInput
from pygmt.helpers import GMTTempFile
from pygmt.helpers.testing import skip_if_no
from pygmt.tests.test_clib import mock

try:
import pyarrow as pa
except ImportError:
pa = None

POINTS_DATA = Path(__file__).parent / "data" / "points.txt"


Expand Down Expand Up @@ -225,16 +231,28 @@ def test_virtualfile_from_vectors(dtypes):
assert output == expected


@pytest.mark.benchmark
@pytest.mark.parametrize("dtype", [str, object])
def test_virtualfile_from_vectors_one_string_or_object_column(dtype):
@pytest.mark.parametrize(
("array_func", "dtype"),
[
pytest.param(np.array, {"dtype": str}, id="str"),
pytest.param(np.array, {"dtype": object}, id="object"),
pytest.param(
getattr(pa, "array", None),
{"type": "string"}, # pa.string()
marks=skip_if_no(package="pyarrow"),
id="pyarrow",
),
],
)
def test_virtualfile_from_vectors_one_string_or_object_column(array_func, dtype):
"""
Test passing in one column with string or object dtype into virtual file dataset.
Test passing in one column with string (numpy/pyarrow) or object (numpy)
dtype into virtual file dataset.
"""
size = 5
x = np.arange(size, dtype=np.int32)
y = np.arange(size, size * 2, 1, dtype=np.int32)
strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=dtype)
strings = array_func(["a", "bc", "defg", "hijklmn", "opqrst"], **dtype)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(x, y, strings) as vfile:
with GMTTempFile() as outfile:
Expand Down
24 changes: 21 additions & 3 deletions pygmt/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@
from pygmt import Figure
from pygmt.exceptions import GMTCLibError, GMTInvalidInput
from pygmt.helpers import GMTTempFile
from pygmt.helpers.testing import skip_if_no

try:
import pyarrow as pa
except ImportError:
pa = None

TEST_DATA_DIR = Path(__file__).parent / "data"
POINTS_DATA = TEST_DATA_DIR / "points.txt"
Expand Down Expand Up @@ -48,8 +54,20 @@ def test_text_single_line_of_text(region, projection):


@pytest.mark.benchmark
@pytest.mark.mpl_image_compare
def test_text_multiple_lines_of_text(region, projection):
@pytest.mark.mpl_image_compare(filename="test_text_multiple_lines_of_text.png")
@pytest.mark.parametrize(
"array_func",
[
list,
pytest.param(np.array, id="numpy"),
pytest.param(
getattr(pa, "array", None),
marks=skip_if_no(package="pyarrow"),
id="pyarrow",
),
],
)
def test_text_multiple_lines_of_text(region, projection, array_func):
"""
Place multiple lines of text at their respective x, y locations.
"""
Expand All @@ -59,7 +77,7 @@ def test_text_multiple_lines_of_text(region, projection):
projection=projection,
x=[1.2, 1.6],
y=[0.6, 0.3],
text=["This is a line of text", "This is another line of text"],
text=array_func(["This is a line of text", "This is another line of text"]),
)
return fig

Expand Down