Skip to content

Commit

Permalink
Merge pull request OSGeo#9586 from rouault/parquet_geoarrow
Browse files Browse the repository at this point in the history
Parquet/Arrow: add support for GeoArrow (struct/separate) encoding (GeoParquet 1.1)
  • Loading branch information
rouault authored Apr 15, 2024
2 parents cf937e6 + 5982e63 commit 5beb5cf
Show file tree
Hide file tree
Showing 15 changed files with 2,326 additions and 585 deletions.
4 changes: 2 additions & 2 deletions autotest/ogr/ogr_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def test_ogr_arrow_read_all_geom_types(filename_prefix, dim):
],
)
@pytest.mark.parametrize("dim", ["", "_z", "_m", "_zm"])
@pytest.mark.parametrize("encoding", ["WKB", "WKT", "GEOARROW"])
@pytest.mark.parametrize("encoding", ["WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"])
def test_ogr_arrow_write_all_geom_types(filename_prefix, dim, encoding):

test_filename = (
Expand All @@ -124,7 +124,7 @@ def test_ogr_arrow_write_all_geom_types(filename_prefix, dim, encoding):
ds_ref = ogr.Open(test_filename)
lyr_ref = ds_ref.GetLayer(0)

if encoding != "GEOARROW" or lyr_ref.GetGeomType() not in (
if not encoding.startswith("GEOARROW") or lyr_ref.GetGeomType() not in (
ogr.wkbGeometryCollection,
ogr.wkbGeometryCollection25D,
ogr.wkbGeometryCollectionM,
Expand Down
147 changes: 147 additions & 0 deletions autotest/ogr/ogr_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -3482,3 +3482,150 @@ def check_file(filename):
layerCreationOptions=["SORT_BY_BBOX=YES", "ROW_GROUP_SIZE=100"],
)
check_file(outfilename2)


###############################################################################
# Check GeoArrow struct encoding


@pytest.mark.parametrize(
    "wkt",
    [
        "POINT (1 2)",
        "POINT Z (1 2 3)",
        "LINESTRING (1 2,3 4)",
        "LINESTRING Z (1 2 3,4 5 6)",
        "POLYGON ((0 1,2 3,10 20,0 1))",
        "POLYGON ((0 0,0 10,10 10,10 0,0 0),(1 1,1 9,9 9,9 1,1 1))",
        "POLYGON Z ((0 1 10,2 3 20,10 20 30,0 1 10))",
        "MULTIPOINT ((1 2),(3 4))",
        "MULTIPOINT Z ((1 2 3),(4 5 6))",
        "MULTILINESTRING ((1 2,3 4),(5 6,7 8,9 10))",
        "MULTILINESTRING Z ((1 2 3,4 5 6),(7 8 9,10 11 12,13 14 15))",
        "MULTIPOLYGON (((0 1,2 3,10 20,0 1)),((100 110,100 120,120 120,100 110)))",
        "MULTIPOLYGON (((0 0,0 10,10 10,10 0,0 0),(1 1,1 9,9 9,9 1,1 1)),((100 110,100 120,120 120,100 110)))",
        "MULTIPOLYGON Z (((0 1 10,2 3 20,10 20 30,0 1 10)))",
    ],
)
@pytest.mark.parametrize("check_with_pyarrow", [True, False])
@pytest.mark.parametrize("covering_bbox", [True, False])
@gdaltest.enable_exceptions()
def test_ogr_parquet_geoarrow(
    tmp_vsimem, tmp_path, wkt, check_with_pyarrow, covering_bbox
):
    """Round-trip a geometry through a Parquet layer written with
    GEOMETRY_ENCODING=GEOARROW.

    The layer receives, in order: the nominal geometry, a feature without
    geometry, an empty geometry of the same type, the nominal geometry again
    and, for geometries with more than one part, a clone with the part at
    index 1 removed. The file is then re-read to check geometries (with and
    without ignored attribute fields) and spatial filtering.

    Parameters:
        wkt: WKT of the nominal geometry.
        check_with_pyarrow: when True, write to a real file (tmp_path) and
            inspect the Arrow type of the "geometry" column with pyarrow;
            the test is skipped if pyarrow.parquet cannot be imported.
            When False, write under tmp_vsimem.
        covering_bbox: value forwarded as WRITE_COVERING_BBOX=YES/NO.
    """

    geom = ogr.CreateGeometryFromWkt(wkt)

    if check_with_pyarrow:
        pa_parquet = pytest.importorskip("pyarrow.parquet")
        filename = str(tmp_path / "test_ogr_parquet_geoarrow.parquet")
    else:
        filename = str(tmp_vsimem / "test_ogr_parquet_geoarrow.parquet")

    ds = ogr.GetDriverByName("Parquet").CreateDataSource(filename)

    lyr = ds.CreateLayer(
        "test",
        geom_type=geom.GetGeometryType(),
        options=[
            "GEOMETRY_ENCODING=GEOARROW",
            "WRITE_COVERING_BBOX=" + ("YES" if covering_bbox else "NO"),
        ],
    )
    lyr.CreateField(ogr.FieldDefn("foo"))

    # Nominal geometry
    f = ogr.Feature(lyr.GetLayerDefn())
    f.SetGeometry(geom)
    lyr.CreateFeature(f)

    # Null geometry
    f = ogr.Feature(lyr.GetLayerDefn())
    lyr.CreateFeature(f)

    # Empty geometry
    f = ogr.Feature(lyr.GetLayerDefn())
    f.SetGeometry(ogr.Geometry(geom.GetGeometryType()))
    lyr.CreateFeature(f)

    # Nominal geometry
    f = ogr.Feature(lyr.GetLayerDefn())
    f.SetGeometry(geom)
    lyr.CreateFeature(f)

    # For geometries with several parts, also write a variant with the
    # part at index 1 removed
    geom2 = None
    if geom.GetGeometryCount() > 1:
        geom2 = geom.Clone()
        geom2.RemoveGeometry(1)
        f = ogr.Feature(lyr.GetLayerDefn())
        f.SetGeometry(geom2)
        lyr.CreateFeature(f)

    # Drop the dataset reference before the file is re-read below
    ds = None

    # Check we actually use a GeoArrow encoding
    if check_with_pyarrow:
        table = pa_parquet.read_table(filename)
        import pyarrow as pa

        if geom.GetGeometryType() in [ogr.wkbPoint, ogr.wkbPoint25D]:
            assert pa.types.is_struct(table.schema.field("geometry").type)
        else:
            assert pa.types.is_list(table.schema.field("geometry").type)

    _validate(filename)

    def check(lyr):
        # Features must come back in the order they were written
        assert lyr.GetGeomType() == geom.GetGeometryType()

        f = lyr.GetNextFeature()
        ogrtest.check_feature_geometry(f, geom)

        f = lyr.GetNextFeature()
        assert f.GetGeometryRef() is None

        f = lyr.GetNextFeature()
        ogrtest.check_feature_geometry(f, ogr.Geometry(geom.GetGeometryType()))

        f = lyr.GetNextFeature()
        ogrtest.check_feature_geometry(f, geom)

        if geom2:
            f = lyr.GetNextFeature()
            ogrtest.check_feature_geometry(f, geom2)

    ds = ogr.Open(filename)
    lyr = ds.GetLayer(0)
    check(lyr)

    # Check that ignoring attribute fields doesn't impact geometry reading
    ds = ogr.Open(filename)
    lyr = ds.GetLayer(0)
    lyr.SetIgnoredFields(["foo"])
    check(lyr)

    ds = ogr.Open(filename)
    lyr = ds.GetLayer(0)
    # GetEnvelope() returns (minx, maxx, miny, maxy), in that order
    minx, maxx, miny, maxy = geom.GetEnvelope()

    lyr.SetSpatialFilter(geom)
    assert lyr.GetFeatureCount() == (3 if geom.GetGeometryCount() > 1 else 2)

    # Rectangles strictly right of, above, left of and below the envelope
    # must not select anything
    lyr.SetSpatialFilterRect(maxx + 1, miny, maxx + 2, maxy)
    assert lyr.GetFeatureCount() == 0

    lyr.SetSpatialFilterRect(minx, maxy + 1, maxx, maxy + 2)
    assert lyr.GetFeatureCount() == 0

    lyr.SetSpatialFilterRect(minx - 2, miny, minx - 1, maxy)
    assert lyr.GetFeatureCount() == 0

    lyr.SetSpatialFilterRect(minx, miny - 2, maxx, miny - 1)
    assert lyr.GetFeatureCount() == 0

    # A rectangle slightly shrunk from the envelope only makes sense when the
    # envelope is not degenerate along either axis: compare min/max of the
    # *same* axis. (Comparing minx with miny would needlessly skip geometries
    # such as a square anchored at the origin, and would build an inverted
    # rectangle for points.) Multipoints are excluded since the shrunk
    # rectangle may contain none of their points.
    if (
        minx != maxx
        and miny != maxy
        and ogr.GT_Flatten(geom.GetGeometryType()) != ogr.wkbMultiPoint
    ):
        lyr.SetSpatialFilterRect(minx + 0.1, miny + 0.1, maxx - 0.1, maxy - 0.1)
        assert lyr.GetFeatureCount() != 0
8 changes: 8 additions & 0 deletions autotest/pymod/ogrtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,14 @@ def check_feature_geometry(
if ogr.GT_Flatten(actual.GetGeometryType()) == ogr.wkbPoint:
count = 1

# Point Empty is often encoded with NaN values, hence do not attempt
# X/Y comparisons
if expected.IsEmpty():
assert actual.IsEmpty()
return
else:
assert not actual.IsEmpty()

for i in range(count):
actual_pt = [actual.GetX(i), actual.GetY(i)]
expected_pt = [expected.GetX(i), expected.GetY(i)]
Expand Down
9 changes: 8 additions & 1 deletion doc/source/drivers/vector/arrow.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,17 @@ Layer creation options
"/vsistdout/" or its extension is ".arrows", in which case STREAM is used.

- .. lco:: GEOMETRY_ENCODING
:choices: GEOARROW, WKB, WKT
:choices: GEOARROW, WKB, WKT, GEOARROW_INTERLEAVED
:default: GEOARROW

Geometry encoding.
As of GDAL 3.9, GEOARROW uses the GeoArrow "struct" based
encodings (where points are modeled as a struct field with x and y subfields,
lines are modeled as a list of such points, etc.).
The GEOARROW_INTERLEAVED option has been renamed in GDAL 3.9 from what was
named GEOARROW in previous versions, and uses an encoding where points use
a FixedSizeList of (x,y), lines a variable-size list of such
FixedSizeList of points, etc.

- .. lco:: BATCH_SIZE
:choices: <integer>
Expand Down
26 changes: 18 additions & 8 deletions doc/source/drivers/vector/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@ Parquet is available in multiple languages including Java, C++, Python, etc..."

This driver also supports geometry columns using the GeoParquet specification.

.. note:: The driver should be considered experimental as the GeoParquet specification is not finalized yet.

The GeoParquet 1.0.0 specification is supported since GDAL 3.8.0
The GeoParquet 1.0.0 specification is supported since GDAL 3.8.0.
The GeoParquet 1.1.0 specification is supported since GDAL 3.9.0.

Driver capabilities
-------------------
Expand Down Expand Up @@ -67,13 +66,21 @@ Layer creation options
Defaults to SNAPPY when available, otherwise NONE.

- .. lco:: GEOMETRY_ENCODING
:choices: WKB, WKT, GEOARROW
:choices: WKB, WKT, GEOARROW, GEOARROW_INTERLEAVED
:default: WKB

Geometry encoding.
Other encodings (WKT and GEOARROW) are *not* allowed by the GeoParquet
specification, but are handled as an extension, for symmetry with the Arrow
driver.
WKB is the default and recommended choice for maximal interoperability.
WKT is *not* allowed by the GeoParquet specification, but is handled as
an extension.
As of GDAL 3.9, GEOARROW uses the GeoParquet 1.1 GeoArrow "struct" based
encodings (where points are modeled as a struct field with x and y subfields,
lines are modeled as a list of such points, etc.).
The GEOARROW_INTERLEAVED option has been renamed in GDAL 3.9 from what was
named GEOARROW in previous versions, and uses an encoding where points use
a FixedSizeList of (x,y), lines a variable-size list of such
FixedSizeList of points, etc. This GEOARROW_INTERLEAVED encoding is not
part of the official GeoParquet specification, and its use is not encouraged.

- .. lco:: ROW_GROUP_SIZE
:choices: <integer>
Expand Down Expand Up @@ -121,7 +128,10 @@ Layer creation options
:since: 3.9

Whether to write xmin/ymin/xmax/ymax columns with the bounding box of
geometries.
geometries. Writing the geometry bounding box may help applications to
perform faster spatial filtering. Writing a geometry bounding box is less
necessary for the GeoArrow geometry encoding than for the default WKB, as
implementations may be able to directly use the geometry columns.

- .. lco:: SORT_BY_BBOX
:choices: YES, NO
Expand Down
6 changes: 5 additions & 1 deletion ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -370,10 +370,14 @@ void OGRFeatherDriver::InitMetadata()
CPLAddXMLAttributeAndValue(psOption, "description",
"Encoding of geometry columns");
CPLAddXMLAttributeAndValue(psOption, "default", "GEOARROW");
for (const char *pszEncoding : {"GEOARROW", "WKB", "WKT"})
for (const char *pszEncoding :
{"GEOARROW", "GEOARROW_INTERLEAVED", "WKB", "WKT"})
{
auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value");
CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding);
if (EQUAL(pszEncoding, "GEOARROW"))
CPLAddXMLAttributeAndValue(poValueNode, "alias",
"GEOARROW_STRUCT");
}
}

Expand Down
19 changes: 12 additions & 7 deletions ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,18 @@ bool OGRFeatherWriterLayer::SetOptions(const std::string &osFilename,

const char *pszGeomEncoding =
CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING");
m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC;
m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC;
if (pszGeomEncoding)
{
if (EQUAL(pszGeomEncoding, "WKB"))
m_eGeomEncoding = OGRArrowGeomEncoding::WKB;
else if (EQUAL(pszGeomEncoding, "WKT"))
m_eGeomEncoding = OGRArrowGeomEncoding::WKT;
else if (EQUAL(pszGeomEncoding, "GEOARROW"))
m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC;
else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED"))
m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC;
else if (EQUAL(pszGeomEncoding, "GEOARROW") ||
EQUAL(pszGeomEncoding, "GEOARROW_STRUCT"))
m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC;
else
{
CPLError(CE_Failure, CPLE_NotSupported,
Expand All @@ -129,10 +132,12 @@ bool OGRFeatherWriterLayer::SetOptions(const std::string &osFilename,

m_poFeatureDefn->SetGeomType(eGType);
auto eGeomEncoding = m_eGeomEncoding;
if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC)
if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC ||
eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC)
{
eGeomEncoding = GetPreciseArrowGeomEncoding(eGType);
if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC)
const auto eEncodingType = eGeomEncoding;
eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType);
if (eGeomEncoding == eEncodingType)
return false;
}
m_aeGeomEncoding.push_back(eGeomEncoding);
Expand Down Expand Up @@ -237,7 +242,7 @@ void OGRFeatherWriterLayer::CreateSchema()
CPLJSONObject oColumn;
oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn);
oColumn.Add("encoding",
GetGeomEncodingAsString(m_aeGeomEncoding[i], true));
GetGeomEncodingAsString(m_aeGeomEncoding[i], false));

const auto poSRS = poGeomFieldDefn->GetSpatialRef();
if (poSRS)
Expand Down
37 changes: 29 additions & 8 deletions ogr/ogrsf_frmts/arrow_common/ogr_arrow.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,24 @@ enum class OGRArrowGeomEncoding
{
WKB,
WKT,
GEOARROW_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding
GEOARROW_POINT,
GEOARROW_LINESTRING,
GEOARROW_POLYGON,
GEOARROW_MULTIPOINT,
GEOARROW_MULTILINESTRING,
GEOARROW_MULTIPOLYGON,

// F(ixed) S(ize) L(ist) of (x,y[,z][,m]) values / Interleaved layout
GEOARROW_FSL_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding
GEOARROW_FSL_POINT,
GEOARROW_FSL_LINESTRING,
GEOARROW_FSL_POLYGON,
GEOARROW_FSL_MULTIPOINT,
GEOARROW_FSL_MULTILINESTRING,
GEOARROW_FSL_MULTIPOLYGON,

// Struct of (x,y[,z][,m])
GEOARROW_STRUCT_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding
GEOARROW_STRUCT_POINT,
GEOARROW_STRUCT_LINESTRING,
GEOARROW_STRUCT_POLYGON,
GEOARROW_STRUCT_MULTIPOINT,
GEOARROW_STRUCT_MULTILINESTRING,
GEOARROW_STRUCT_MULTIPOLYGON,
};

/************************************************************************/
Expand Down Expand Up @@ -235,6 +246,11 @@ class OGRArrowLayer CPL_NON_FINAL
int GetNextArrowArray(struct ArrowArrayStream *,
struct ArrowArray *out) override;

//! Advance the current feature index (m_nFeatureIdx) by one.
// Declared virtual, so derived layers may override how the index advances.
virtual void IncrFeatureIdx()
{
++m_nFeatureIdx;
}

public:
virtual ~OGRArrowLayer() override;

Expand Down Expand Up @@ -340,6 +356,10 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer
std::vector<OGRArrowGeomEncoding> m_aeGeomEncoding{};
int m_nWKTCoordinatePrecision = -1;

//! Base struct data type for GeoArrow struct geometry columns.
// Constraint: if not empty, m_apoBaseStructGeomType.size() == m_poFeatureDefn->GetGeomFieldCount()
std::vector<std::shared_ptr<arrow::DataType>> m_apoBaseStructGeomType{};

//! Whether to use a struct field with the values of the bounding box
// of the geometries. Used by Parquet.
bool m_bWriteBBoxStruct = false;
Expand Down Expand Up @@ -376,7 +396,8 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer
m_oSetWrittenGeometryTypes{}; // size: GetGeomFieldCount()

static OGRArrowGeomEncoding
GetPreciseArrowGeomEncoding(OGRwkbGeometryType eGType);
GetPreciseArrowGeomEncoding(OGRArrowGeomEncoding eEncodingType,
OGRwkbGeometryType eGType);
static const char *
GetGeomEncodingAsString(OGRArrowGeomEncoding eGeomEncoding,
bool bForParquetGeo);
Expand Down
Loading

0 comments on commit 5beb5cf

Please sign in to comment.