diff --git a/autotest/ogr/ogr_arrow.py b/autotest/ogr/ogr_arrow.py index 4ef38f92988c..22efac1849f5 100755 --- a/autotest/ogr/ogr_arrow.py +++ b/autotest/ogr/ogr_arrow.py @@ -112,7 +112,7 @@ def test_ogr_arrow_read_all_geom_types(filename_prefix, dim): ], ) @pytest.mark.parametrize("dim", ["", "_z", "_m", "_zm"]) -@pytest.mark.parametrize("encoding", ["WKB", "WKT", "GEOARROW"]) +@pytest.mark.parametrize("encoding", ["WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"]) def test_ogr_arrow_write_all_geom_types(filename_prefix, dim, encoding): test_filename = ( @@ -124,7 +124,7 @@ def test_ogr_arrow_write_all_geom_types(filename_prefix, dim, encoding): ds_ref = ogr.Open(test_filename) lyr_ref = ds_ref.GetLayer(0) - if encoding != "GEOARROW" or lyr_ref.GetGeomType() not in ( + if not encoding.startswith("GEOARROW") or lyr_ref.GetGeomType() not in ( ogr.wkbGeometryCollection, ogr.wkbGeometryCollection25D, ogr.wkbGeometryCollectionM, diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index b95095672c94..208ccb4ced5e 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -3482,3 +3482,150 @@ def check_file(filename): layerCreationOptions=["SORT_BY_BBOX=YES", "ROW_GROUP_SIZE=100"], ) check_file(outfilename2) + + +############################################################################### +# Check GeoArrow struct encoding + + +@pytest.mark.parametrize( + "wkt", + [ + "POINT (1 2)", + "POINT Z (1 2 3)", + "LINESTRING (1 2,3 4)", + "LINESTRING Z (1 2 3,4 5 6)", + "POLYGON ((0 1,2 3,10 20,0 1))", + "POLYGON ((0 0,0 10,10 10,10 0,0 0),(1 1,1 9,9 9,9 1,1 1))", + "POLYGON Z ((0 1 10,2 3 20,10 20 30,0 1 10))", + "MULTIPOINT ((1 2),(3 4))", + "MULTIPOINT Z ((1 2 3),(4 5 6))", + "MULTILINESTRING ((1 2,3 4),(5 6,7 8,9 10))", + "MULTILINESTRING Z ((1 2 3,4 5 6),(7 8 9,10 11 12,13 14 15))", + "MULTIPOLYGON (((0 1,2 3,10 20,0 1)),((100 110,100 120,120 120,100 110)))", + "MULTIPOLYGON (((0 0,0 10,10 10,10 0,0 0),(1 1,1 9,9 9,9 1,1 1)),((100 110,100 120,120 120,100 110)))", + "MULTIPOLYGON Z (((0 1 10,2 3 20,10 20 30,0 1 10)))", + ], +) +@pytest.mark.parametrize("check_with_pyarrow", [True, False]) +@pytest.mark.parametrize("covering_bbox", [True, False]) +@gdaltest.enable_exceptions() +def test_ogr_parquet_geoarrow( + tmp_vsimem, tmp_path, wkt, check_with_pyarrow, covering_bbox +): + + geom = ogr.CreateGeometryFromWkt(wkt) + + if check_with_pyarrow: + pa_parquet = pytest.importorskip("pyarrow.parquet") + filename = str(tmp_path / "test_ogr_parquet_geoarrow.parquet") + else: + filename = str(tmp_vsimem / "test_ogr_parquet_geoarrow.parquet") + + ds = ogr.GetDriverByName("Parquet").CreateDataSource(filename) + + lyr = ds.CreateLayer( + "test", + geom_type=geom.GetGeometryType(), + options=[ + "GEOMETRY_ENCODING=GEOARROW", + "WRITE_COVERING_BBOX=" + ("YES" if covering_bbox else "NO"), + ], + ) + lyr.CreateField(ogr.FieldDefn("foo")) + + # Nominal geometry + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetGeometry(geom) + lyr.CreateFeature(f) + + # Null geometry + f = ogr.Feature(lyr.GetLayerDefn()) + lyr.CreateFeature(f) + + # Empty geometry + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetGeometry(ogr.Geometry(geom.GetGeometryType())) + lyr.CreateFeature(f) + + # Nominal geometry + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetGeometry(geom) + lyr.CreateFeature(f) + + geom2 = None + if geom.GetGeometryCount() > 1: + geom2 = geom.Clone() + geom2.RemoveGeometry(1) + f = ogr.Feature(lyr.GetLayerDefn()) + f.SetGeometry(geom2) + lyr.CreateFeature(f) + + ds = None + + # Check we actually use a GeoArrow encoding + if check_with_pyarrow: + table = pa_parquet.read_table(filename) + import pyarrow as pa + + if geom.GetGeometryType() in [ogr.wkbPoint, ogr.wkbPoint25D]: + assert pa.types.is_struct(table.schema.field("geometry").type) + else: + assert pa.types.is_list(table.schema.field("geometry").type) + + _validate(filename) + + def check(lyr): + assert lyr.GetGeomType() == geom.GetGeometryType() + + f = lyr.GetNextFeature() + ogrtest.check_feature_geometry(f, geom) + + f = lyr.GetNextFeature() + assert f.GetGeometryRef() is None + + f = lyr.GetNextFeature() + ogrtest.check_feature_geometry(f, ogr.Geometry(geom.GetGeometryType())) + + f = lyr.GetNextFeature() + ogrtest.check_feature_geometry(f, geom) + + if geom2: + f = lyr.GetNextFeature() + ogrtest.check_feature_geometry(f, geom2) + + ds = ogr.Open(filename) + lyr = ds.GetLayer(0) + check(lyr) + + # Check that ignoring attribute fields doesn't impact geometry reading + ds = ogr.Open(filename) + lyr = ds.GetLayer(0) + lyr.SetIgnoredFields(["foo"]) + check(lyr) + + ds = ogr.Open(filename) + lyr = ds.GetLayer(0) + minx, maxx, miny, maxy = geom.GetEnvelope() + + lyr.SetSpatialFilter(geom) + assert lyr.GetFeatureCount() == (3 if geom.GetGeometryCount() > 1 else 2) + + lyr.SetSpatialFilterRect(maxx + 1, miny, maxx + 2, maxy) + assert lyr.GetFeatureCount() == 0 + + lyr.SetSpatialFilterRect(minx, maxy + 1, maxx, maxy + 2) + assert lyr.GetFeatureCount() == 0 + + lyr.SetSpatialFilterRect(minx - 2, miny, minx - 1, maxy) + assert lyr.GetFeatureCount() == 0 + + lyr.SetSpatialFilterRect(minx, miny - 2, maxx, miny - 1) + assert lyr.GetFeatureCount() == 0 + if ( + minx != miny + and maxx != maxy + and ogr.GT_Flatten(geom.GetGeometryType()) != ogr.wkbMultiPoint + ): + lyr.SetSpatialFilterRect(minx + 0.1, miny + 0.1, maxx - 0.1, maxy - 0.1) + assert lyr.GetFeatureCount() != 0 diff --git a/autotest/pymod/ogrtest.py b/autotest/pymod/ogrtest.py index a88fd007f3e1..fae366bc013e 100755 --- a/autotest/pymod/ogrtest.py +++ b/autotest/pymod/ogrtest.py @@ -172,6 +172,14 @@ def check_feature_geometry( if ogr.GT_Flatten(actual.GetGeometryType()) == ogr.wkbPoint: count = 1 + # Point Empty is often encoded with NaN values, hence do not attempt + # X/Y comparisons + if expected.IsEmpty(): + assert actual.IsEmpty() + return + else: + assert not actual.IsEmpty() + for i in range(count): actual_pt = [actual.GetX(i), actual.GetY(i)] expected_pt = [expected.GetX(i), expected.GetY(i)] diff --git a/doc/source/drivers/vector/arrow.rst b/doc/source/drivers/vector/arrow.rst index 4c00c44f6545..775f0f699762 100644 --- a/doc/source/drivers/vector/arrow.rst +++ b/doc/source/drivers/vector/arrow.rst @@ -68,10 +68,17 @@ Layer creation options "/vsistdout/" or its extension is ".arrows", in which case STREAM is used. - .. lco:: GEOMETRY_ENCODING - :choices: GEOARROW, WKB, WKT + :choices: GEOARROW, WKB, WKT, GEOARROW_INTERLEAVED :default: GEOARROW Geometry encoding. + As of GDAL 3.9, GEOARROW uses the GeoArrow "struct" based + encodings (where points are modeled as a struct field with a x and y subfield, + lines are modeled as a list of such points, etc.). + The GEOARROW_INTERLEAVED option has been renamed in GDAL 3.9 from what was + named GEOARROW in previous versions, and uses an encoding where points uses + a FixedSizedList of (x,y), lines a variable-size list of such + FixedSizedList of points, etc. - .. lco:: BATCH_SIZE :choices: diff --git a/doc/source/drivers/vector/parquet.rst b/doc/source/drivers/vector/parquet.rst index 687d83f1ddf4..965b27dd0fe0 100644 --- a/doc/source/drivers/vector/parquet.rst +++ b/doc/source/drivers/vector/parquet.rst @@ -16,9 +16,8 @@ Parquet is available in multiple languages including Java, C++, Python, etc..." This driver also supports geometry columns using the GeoParquet specification. -.. note:: The driver should be considered experimental as the GeoParquet specification is not finalized yet. - -The GeoParquet 1.0.0 specification is supported since GDAL 3.8.0 +The GeoParquet 1.0.0 specification is supported since GDAL 3.8.0. +The GeoParquet 1.1.0 specification is supported since GDAL 3.9.0. Driver capabilities ------------------- @@ -67,13 +66,21 @@ Layer creation options Defaults to SNAPPY when available, otherwise NONE. - .. lco:: GEOMETRY_ENCODING - :choices: WKB, WKT, GEOARROW + :choices: WKB, WKT, GEOARROW, GEOARROW_INTERLEAVED :default: WKB Geometry encoding. - Other encodings (WKT and GEOARROW) are *not* allowed by the GeoParquet - specification, but are handled as an extension, for symmetry with the Arrow - driver. + WKB is the default and recommended choice for maximal interoperability. + WKT is *not* allowed by the GeoParquet specification, but are handled as + an extension. + As of GDAL 3.9, GEOARROW uses the GeoParquet 1.1 GeoArrow "struct" based + encodings (where points are modeled as a struct field with a x and y subfield, + lines are modeled as a list of such points, etc.). + The GEOARROW_INTERLEAVED option has been renamed in GDAL 3.9 from what was + named GEOARROW in previous versions, and uses an encoding where points uses + a FixedSizedList of (x,y), lines a variable-size list of such + FixedSizedList of points, etc. This GEOARROW_INTERLEAVED encoding is not + part of the official GeoParquet specification, and its use is not encouraged. - .. lco:: ROW_GROUP_SIZE :choices: @@ -121,7 +128,10 @@ Layer creation options :since: 3.9 Whether to write xmin/ymin/xmax/ymax columns with the bounding box of - geometries. + geometries. Writing the geometry bounding box may help applications to + perform faster spatial filtering. Writing a geometry bounding box is less + necessary for the GeoArrow geometry encoding than for the default WKB, as + implementations may be able to directly use the geometry columns. - .. lco:: SORT_BY_BBOX :choices: YES, NO diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp index fe136696f7b8..d37d31c340d7 100644 --- a/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherdriver.cpp @@ -370,10 +370,14 @@ void OGRFeatherDriver::InitMetadata() CPLAddXMLAttributeAndValue(psOption, "description", "Encoding of geometry columns"); CPLAddXMLAttributeAndValue(psOption, "default", "GEOARROW"); - for (const char *pszEncoding : {"GEOARROW", "WKB", "WKT"}) + for (const char *pszEncoding : + {"GEOARROW", "GEOARROW_INTERLEAVED", "WKB", "WKT"}) { auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding); + if (EQUAL(pszEncoding, "GEOARROW")) + CPLAddXMLAttributeAndValue(poValueNode, "alias", + "GEOARROW_STRUCT"); } } diff --git a/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp index a78792cc731d..13080c5c1123 100644 --- a/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp +++ b/ogr/ogrsf_frmts/arrow/ogrfeatherwriterlayer.cpp @@ -97,15 +97,18 @@ bool OGRFeatherWriterLayer::SetOptions(const std::string &osFilename, const char *pszGeomEncoding = CSLFetchNameValue(papszOptions, "GEOMETRY_ENCODING"); - m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC; + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC; if (pszGeomEncoding) { if (EQUAL(pszGeomEncoding, "WKB")) m_eGeomEncoding = OGRArrowGeomEncoding::WKB; else if (EQUAL(pszGeomEncoding, "WKT")) m_eGeomEncoding = OGRArrowGeomEncoding::WKT; - else if (EQUAL(pszGeomEncoding, "GEOARROW")) - m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC; + else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED")) + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC; + else if (EQUAL(pszGeomEncoding, "GEOARROW") || + EQUAL(pszGeomEncoding, "GEOARROW_STRUCT")) + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC; else { CPLError(CE_Failure, CPLE_NotSupported, @@ -129,10 +132,12 @@ bool OGRFeatherWriterLayer::SetOptions(const std::string &osFilename, m_poFeatureDefn->SetGeomType(eGType); auto eGeomEncoding = m_eGeomEncoding; - if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC) + if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC || + eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC) { - eGeomEncoding = GetPreciseArrowGeomEncoding(eGType); - if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC) + const auto eEncodingType = eGeomEncoding; + eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType); + if (eGeomEncoding == eEncodingType) return false; } m_aeGeomEncoding.push_back(eGeomEncoding); @@ -237,7 +242,7 @@ void OGRFeatherWriterLayer::CreateSchema() CPLJSONObject oColumn; oColumns.Add(poGeomFieldDefn->GetNameRef(), oColumn); oColumn.Add("encoding", - GetGeomEncodingAsString(m_aeGeomEncoding[i], true)); + GetGeomEncodingAsString(m_aeGeomEncoding[i], false)); const auto poSRS = poGeomFieldDefn->GetSpatialRef(); if (poSRS) diff --git a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h index 8ec33ef4faba..0d0d784959c2 100644 --- a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h +++ b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h @@ -41,13 +41,24 @@ enum class OGRArrowGeomEncoding { WKB, WKT, - GEOARROW_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding - GEOARROW_POINT, - GEOARROW_LINESTRING, - GEOARROW_POLYGON, - GEOARROW_MULTIPOINT, - GEOARROW_MULTILINESTRING, - GEOARROW_MULTIPOLYGON, + + // F(ixed) S(ize) L(ist) of (x,y[,z][,m]) values / Interleaved layout + GEOARROW_FSL_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding + GEOARROW_FSL_POINT, + GEOARROW_FSL_LINESTRING, + GEOARROW_FSL_POLYGON, + GEOARROW_FSL_MULTIPOINT, + GEOARROW_FSL_MULTILINESTRING, + GEOARROW_FSL_MULTIPOLYGON, + + // Struct of (x,y,[,z][,m]) + GEOARROW_STRUCT_GENERIC, // only used by OGRArrowWriterLayer::m_eGeomEncoding + GEOARROW_STRUCT_POINT, + GEOARROW_STRUCT_LINESTRING, + GEOARROW_STRUCT_POLYGON, + GEOARROW_STRUCT_MULTIPOINT, + GEOARROW_STRUCT_MULTILINESTRING, + GEOARROW_STRUCT_MULTIPOLYGON, }; /************************************************************************/ @@ -235,6 +246,11 @@ class OGRArrowLayer CPL_NON_FINAL int GetNextArrowArray(struct ArrowArrayStream *, struct ArrowArray *out) override; + virtual void IncrFeatureIdx() + { + ++m_nFeatureIdx; + } + public: virtual ~OGRArrowLayer() override; @@ -340,6 +356,10 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer std::vector m_aeGeomEncoding{}; int m_nWKTCoordinatePrecision = -1; + //! Base struct data type for GeoArrow struct geometry columns. + // Constraint: if not empty, m_apoBaseStructGeomType.size() == m_poFeatureDefn->GetGeomFieldCount() + std::vector> m_apoBaseStructGeomType{}; + //! Whether to use a struct field with the values of the bounding box // of the geometries. Used by Parquet. bool m_bWriteBBoxStruct = false; @@ -376,7 +396,8 @@ class OGRArrowWriterLayer CPL_NON_FINAL : public OGRLayer m_oSetWrittenGeometryTypes{}; // size: GetGeomFieldCount() static OGRArrowGeomEncoding - GetPreciseArrowGeomEncoding(OGRwkbGeometryType eGType); + GetPreciseArrowGeomEncoding(OGRArrowGeomEncoding eEncodingType, + OGRwkbGeometryType eGType); static const char * GetGeomEncodingAsString(OGRArrowGeomEncoding eGeomEncoding, bool bForParquetGeo); diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index d0506ebef3a2..bc34bc90b047 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -835,6 +835,74 @@ static bool IsListOfPointType(const std::shared_ptr &type, bHasZOut, bHasMOut); } +/************************************************************************/ +/* IsPointStructType() */ +/************************************************************************/ + +static bool IsPointStructType(const std::shared_ptr &type, + bool &bHasZOut, bool &bHasMOut) +{ + if (type->id() != arrow::Type::STRUCT) + return false; + auto poStructType = std::static_pointer_cast(type); + const int nNumFields = poStructType->num_fields(); + if (nNumFields < 2 || nNumFields > 4) + return false; + bHasZOut = false; + bHasMOut = false; + const auto poFieldX = poStructType->field(0); + if (poFieldX->name() != "x" || + poFieldX->type()->id() != arrow::Type::DOUBLE) + return false; + const auto poFieldY = poStructType->field(1); + if (poFieldY->name() != "y" || + poFieldY->type()->id() != arrow::Type::DOUBLE) + return false; + if (nNumFields == 2) + return true; + const auto poField2 = poStructType->field(2); + if (poField2->type()->id() != arrow::Type::DOUBLE) + return false; + if (poField2->name() == "z") + { + bHasZOut = true; + if (nNumFields == 4) + { + const auto poField3 = poStructType->field(3); + if (poField3->name() != "m" || + poField3->type()->id() != arrow::Type::DOUBLE) + return false; + bHasMOut = true; + } + } + else if (poField2->name() == "m") + { + bHasMOut = true; + } + else + { + return false; + } + return true; +} + +/************************************************************************/ +/* IsListOfPointStructType() */ +/************************************************************************/ + +static bool +IsListOfPointStructType(const std::shared_ptr &type, + int nDepth, bool &bHasZOut, bool &bHasMOut) +{ + if (type->id() != arrow::Type::LIST) + return false; + auto poListType = std::static_pointer_cast(type); + return nDepth == 1 + ? IsPointStructType(poListType->value_type(), bHasZOut, bHasMOut) + : IsListOfPointStructType(poListType->value_type(), nDepth - 1, + bHasZOut, bHasMOut); +} + /************************************************************************/ /* IsValidGeometryEncoding() */ /************************************************************************/ @@ -871,7 +939,7 @@ inline bool OGRArrowLayer::IsValidGeometryEncoding( CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a non String type: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::WKT; @@ -891,7 +959,7 @@ inline bool OGRArrowLayer::IsValidGeometryEncoding( CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a non Binary type: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::WKB; @@ -900,108 +968,168 @@ inline bool OGRArrowLayer::IsValidGeometryEncoding( bool bHasZ = false; bool bHasM = false; - if (osEncoding == "geoarrow.point") + if (osEncoding == "geoarrow.point" || osEncoding == "point") { - if (!IsPointType(fieldType, bHasZ, bHasM)) + if (IsPointType(fieldType, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_FSL_POINT; + } + else if (IsPointStructType(fieldType, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT; + } + else { CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a type != fixed_size_list[2]>: %s. " + "double>[2]> and != struct: %s. " "Handling it as a regular field", fieldName.c_str(), fieldType->name().c_str()); return false; } eGeomTypeOut = OGR_GT_SetModifier(wkbPoint, static_cast(bHasZ), static_cast(bHasM)); - eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_POINT; return true; } - if (osEncoding == "geoarrow.linestring") + else if (osEncoding == "geoarrow.linestring" || osEncoding == "linestring") { - if (!IsListOfPointType(fieldType, 1, bHasZ, bHasM)) + if (IsListOfPointType(fieldType, 1, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING; + } + else if (IsListOfPointStructType(fieldType, 1, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING; + } + else { CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a type != fixed_size_list[2]>: %s. " + "double>[2]> and != list>: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eGeomTypeOut = OGR_GT_SetModifier( wkbLineString, static_cast(bHasZ), static_cast(bHasM)); - eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_LINESTRING; return true; } - if (osEncoding == "geoarrow.polygon") + else if (osEncoding == "geoarrow.polygon" || osEncoding == "polygon") { - if (!IsListOfPointType(fieldType, 2, bHasZ, bHasM)) + if (IsListOfPointType(fieldType, 2, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON; + } + else if (IsListOfPointStructType(fieldType, 2, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON; + } + else { CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a type != list[2]>>: %s. " + "fixed_size_list[2]>> and != list>>: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eGeomTypeOut = OGR_GT_SetModifier(wkbPolygon, static_cast(bHasZ), static_cast(bHasM)); - eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_POLYGON; return true; } - if (osEncoding == "geoarrow.multipoint") + else if (osEncoding == "geoarrow.multipoint" || osEncoding == "multipoint") { - if (!IsListOfPointType(fieldType, 1, bHasZ, bHasM)) + if (IsListOfPointType(fieldType, 1, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT; + } + else if (IsListOfPointStructType(fieldType, 1, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT; + } + else { CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a type != fixed_size_list[2]>: %s. " + "double>[2]> and != list>: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eGeomTypeOut = OGR_GT_SetModifier( wkbMultiPoint, static_cast(bHasZ), static_cast(bHasM)); - eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_MULTIPOINT; return true; } - if (osEncoding == "geoarrow.multilinestring") + else if (osEncoding == "geoarrow.multilinestring" || + osEncoding == "multilinestring") { - if (!IsListOfPointType(fieldType, 2, bHasZ, bHasM)) + if (IsListOfPointType(fieldType, 2, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING; + } + else if (IsListOfPointStructType(fieldType, 2, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING; + } + else { CPLError(CE_Warning, CPLE_AppDefined, "Geometry column %s has a type != list[2]>>: %s. " + "fixed_size_list[2]>> and != list>>: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eGeomTypeOut = OGR_GT_SetModifier(wkbMultiLineString, static_cast(bHasZ), static_cast(bHasM)); - eOGRArrowGeomEncodingOut = - OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING; return true; } - if (osEncoding == "geoarrow.multipolygon") + else if (osEncoding == "geoarrow.multipolygon" || + osEncoding == "multipolygon") { - if (!IsListOfPointType(fieldType, 3, bHasZ, bHasM)) + if (IsListOfPointType(fieldType, 3, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON; + } + else if (IsListOfPointStructType(fieldType, 3, bHasZ, bHasM)) + { + eOGRArrowGeomEncodingOut = + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON; + } + else { CPLError( CE_Warning, CPLE_AppDefined, "Geometry column %s has a type != list[2]>>>: %s. " + "list[2]>>> and != " + "list>>>: %s. " "Handling it as a regular field", - fieldName.c_str(), fieldType->name().c_str()); + fieldName.c_str(), fieldType->ToString().c_str()); return false; } eGeomTypeOut = OGR_GT_SetModifier( wkbMultiPolygon, static_cast(bHasZ), static_cast(bHasM)); - eOGRArrowGeomEncodingOut = OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON; return true; } @@ -1653,9 +1781,8 @@ static void ReadList(OGRFeature *poFeature, int i, int64_t nIdxInArray, /************************************************************************/ template -void SetPointsOfLine(OGRLineString *poLS, - const std::shared_ptr &pointValues, - int pointOffset, int numPoints) +void SetPointsOfLine(OGRLineString *poLS, const arrow::DoubleArray *pointValues, + size_t pointOffset, int numPoints) { if (!bHasZ && !bHasM) { @@ -1670,9 +1797,9 @@ void SetPointsOfLine(OGRLineString *poLS, poLS->setNumPoints(numPoints, FALSE); for (int k = 0; k < numPoints; k++) { - if (bHasZ) + if constexpr (bHasZ) { - if (bHasM) + if constexpr (bHasM) { poLS->setPoint(k, pointValues->Value(pointOffset + nDim * k), pointValues->Value(pointOffset + nDim * k + 1), @@ -1695,9 +1822,8 @@ void SetPointsOfLine(OGRLineString *poLS, } } -typedef void (*SetPointsOfLineType)(OGRLineString *, - const std::shared_ptr &, - int, int); +typedef void (*SetPointsOfLineType)(OGRLineString *, const arrow::DoubleArray *, + size_t, int); static SetPointsOfLineType GetSetPointsOfLine(bool bHasZ, bool bHasM) { @@ -1710,6 +1836,89 @@ static SetPointsOfLineType GetSetPointsOfLine(bool bHasZ, bool bHasM) return SetPointsOfLine; } +/************************************************************************/ +/* SetPointsOfLineStruct() */ +/************************************************************************/ + +template +void SetPointsOfLineStruct(OGRLineString *poLS, + const arrow::StructArray *structArray, + size_t pointOffset, int numPoints) +{ + CPLAssert(structArray->num_fields() == nDim); + const auto &fields = structArray->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = static_cast(fieldY.get()); + const arrow::DoubleArray *fieldZDouble = nullptr; + const arrow::DoubleArray *fieldMDouble = nullptr; + int iField = 2; + if constexpr (bHasZ) + { + const auto &field = fields[iField]; + ++iField; + CPLAssert(field->type_id() == arrow::Type::DOUBLE); + fieldZDouble = static_cast(field.get()); + } + if constexpr (bHasM) + { + const auto &field = fields[iField]; + CPLAssert(field->type_id() == arrow::Type::DOUBLE); + fieldMDouble = static_cast(field.get()); + } + + poLS->setNumPoints(numPoints, FALSE); + for (int k = 0; k < numPoints; k++) + { + if constexpr (bHasZ) + { + if constexpr (bHasM) + { + poLS->setPoint(k, fieldXDouble->Value(pointOffset + k), + fieldYDouble->Value(pointOffset + k), + fieldZDouble->Value(pointOffset + k), + fieldMDouble->Value(pointOffset + k)); + } + else + { + poLS->setPoint(k, fieldXDouble->Value(pointOffset + k), + fieldYDouble->Value(pointOffset + k), + fieldZDouble->Value(pointOffset + k)); + } + } + else if constexpr (bHasM) + { + poLS->setPointM(k, fieldXDouble->Value(pointOffset + k), + fieldYDouble->Value(pointOffset + k), + fieldMDouble->Value(pointOffset + k)); + } + else + { + poLS->setPoint(k, fieldXDouble->Value(pointOffset + k), + fieldYDouble->Value(pointOffset + k)); + } + } +} + +typedef void (*SetPointsOfLineStructType)(OGRLineString *, + const arrow::StructArray *, size_t, + int); + +static SetPointsOfLineStructType GetSetPointsOfLineStruct(bool bHasZ, + bool bHasM) +{ + if (bHasZ && bHasM) + return SetPointsOfLineStruct; + if (bHasZ) + return SetPointsOfLineStruct; + if (bHasM) + return SetPointsOfLineStruct; + return SetPointsOfLineStruct; +} + /************************************************************************/ /* TimestampToOGR() */ /************************************************************************/ @@ -2231,8 +2440,7 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, const int nDim = 2 + (bHasZ ? 1 : 0) + (bHasM ? 1 : 0); const auto CreatePoint = - [bHasZ, bHasM](const std::shared_ptr &pointValues, - int pointOffset) + [bHasZ, bHasM](const arrow::DoubleArray *pointValues, int pointOffset) { if (bHasZ) { @@ -2263,6 +2471,76 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, } }; + const auto CreateStructPoint = + [nDim, bHasZ, bHasM](const arrow::StructArray *structArray, + int64_t pointOffset) + { + CPL_IGNORE_RET_VAL(nDim); + CPLAssert(structArray->num_fields() == nDim); + const auto &fieldX = structArray->field(0); + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = structArray->field(1); + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + if (bHasZ) + { + const auto &fieldZ = structArray->field(2); + CPLAssert(fieldZ->type_id() == arrow::Type::DOUBLE); + const auto fieldZDouble = + static_cast(fieldZ.get()); + if (bHasM) + { + const auto &fieldM = structArray->field(3); + CPLAssert(fieldM->type_id() == arrow::Type::DOUBLE); + const auto fieldMDouble = + static_cast(fieldM.get()); + return new OGRPoint(fieldXDouble->Value(pointOffset), + fieldYDouble->Value(pointOffset), + fieldZDouble->Value(pointOffset), + fieldMDouble->Value(pointOffset)); + } + else + { + return new OGRPoint(fieldXDouble->Value(pointOffset), + fieldYDouble->Value(pointOffset), + fieldZDouble->Value(pointOffset)); + } + } + else if (bHasM) + { + const auto &fieldM = structArray->field(2); + CPLAssert(fieldM->type_id() == arrow::Type::DOUBLE); + const auto fieldMDouble = + static_cast(fieldM.get()); + return OGRPoint::createXYM(fieldXDouble->Value(pointOffset), + fieldYDouble->Value(pointOffset), + fieldMDouble->Value(pointOffset)); + } + else + { + return new OGRPoint(fieldXDouble->Value(pointOffset), + fieldYDouble->Value(pointOffset)); + } + }; + + // Arrow 14 since https://github.com/apache/arrow/commit/95a8bfb319b2729c8f6daa069433caba3b4ddddd + // returns reference to shared pointers, so we can safely take the raw pointer + // and cast it. + // Earlier versions returned a non-reference shared pointer, so formally it + // is safer to use static_pointer_cast (although in practice given that + // "values" is a member variable), the Arrow >= 14 path might work... +#if ARROW_VERSION_MAJOR >= 14 +#define GET_PTR_FROM_VALUES(var, type, values) \ + const auto var = static_cast((values).get()) +#else +#define GET_PTR_FROM_VALUES(var, type, values) \ + const auto var##tmp = std::static_pointer_cast(values); \ + const auto var = var##tmp.get() +#endif + switch (m_aeGeomEncoding[iGeomField]) { case OGRArrowGeomEncoding::WKB: @@ -2330,21 +2608,21 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, break; } - case OGRArrowGeomEncoding::GEOARROW_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC: { CPLAssert(false); break; } - case OGRArrowGeomEncoding::GEOARROW_POINT: + case OGRArrowGeomEncoding::GEOARROW_FSL_POINT: { CPLAssert(array->type_id() == arrow::Type::FIXED_SIZE_LIST); const auto listArray = static_cast(array); CPLAssert(listArray->values()->type_id() == arrow::Type::DOUBLE); - const auto pointValues = - std::static_pointer_cast( - listArray->values()); + GET_PTR_FROM_VALUES(pointValues, arrow::DoubleArray, + listArray->values()); if (!pointValues->IsNull(nDim * nIdxInBatch)) { poGeometry = CreatePoint(pointValues, @@ -2355,20 +2633,18 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, break; } - case OGRArrowGeomEncoding::GEOARROW_LINESTRING: + case OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING: { CPLAssert(array->type_id() == arrow::Type::LIST); const auto listArray = static_cast(array); CPLAssert(listArray->values()->type_id() == arrow::Type::FIXED_SIZE_LIST); - const auto listOfPointsValues = - std::static_pointer_cast( - listArray->values()); + GET_PTR_FROM_VALUES(listOfPointsValues, arrow::FixedSizeListArray, + listArray->values()); CPLAssert(listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE); - const auto pointValues = - std::static_pointer_cast( - listOfPointsValues->values()); + GET_PTR_FROM_VALUES(pointValues, arrow::DoubleArray, + listOfPointsValues->values()); const auto nPoints = listArray->value_length(nIdxInBatch); const auto nPointOffset = listArray->value_offset(nIdxInBatch) * nDim; @@ -2389,26 +2665,23 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, break; } - case OGRArrowGeomEncoding::GEOARROW_POLYGON: + case OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON: { CPLAssert(array->type_id() == arrow::Type::LIST); const auto listOfRingsArray = static_cast(array); CPLAssert(listOfRingsArray->values()->type_id() == arrow::Type::LIST); - const auto listOfRingsValues = - std::static_pointer_cast( - listOfRingsArray->values()); + GET_PTR_FROM_VALUES(listOfRingsValues, arrow::ListArray, + listOfRingsArray->values()); CPLAssert(listOfRingsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST); - const auto listOfPointsValues = - std::static_pointer_cast( - listOfRingsValues->values()); + GET_PTR_FROM_VALUES(listOfPointsValues, arrow::FixedSizeListArray, + listOfRingsValues->values()); CPLAssert(listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE); - const auto pointValues = - std::static_pointer_cast( - listOfPointsValues->values()); + GET_PTR_FROM_VALUES(pointValues, arrow::DoubleArray, + listOfPointsValues->values()); const auto setPointsFun = GetSetPointsOfLine(bHasZ, bHasM); const auto nRings = listOfRingsArray->value_length(nIdxInBatch); const auto nRingOffset = @@ -2438,20 +2711,18 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, break; } - case OGRArrowGeomEncoding::GEOARROW_MULTIPOINT: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT: { CPLAssert(array->type_id() == arrow::Type::LIST); const auto listArray = static_cast(array); CPLAssert(listArray->values()->type_id() == arrow::Type::FIXED_SIZE_LIST); - const auto listOfPointsValues = - std::static_pointer_cast( - listArray->values()); + GET_PTR_FROM_VALUES(listOfPointsValues, arrow::FixedSizeListArray, + listArray->values()); CPLAssert(listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE); - const auto pointValues = - std::static_pointer_cast( - listOfPointsValues->values()); + GET_PTR_FROM_VALUES(pointValues, arrow::DoubleArray, + listOfPointsValues->values()); const auto nPoints = listArray->value_length(nIdxInBatch); const auto nPointOffset = listArray->value_offset(nIdxInBatch) * nDim; @@ -2472,26 +2743,23 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, break; } - case OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING: { CPLAssert(array->type_id() == arrow::Type::LIST); const auto listOfStringsArray = static_cast(array); CPLAssert(listOfStringsArray->values()->type_id() == arrow::Type::LIST); - const auto listOfStringsValues = - std::static_pointer_cast( - listOfStringsArray->values()); + GET_PTR_FROM_VALUES(listOfStringsValues, arrow::ListArray, + listOfStringsArray->values()); CPLAssert(listOfStringsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST); - const auto listOfPointsValues = - std::static_pointer_cast( - listOfStringsValues->values()); + GET_PTR_FROM_VALUES(listOfPointsValues, arrow::FixedSizeListArray, + listOfStringsValues->values()); CPLAssert(listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE); - const auto pointValues = - std::static_pointer_cast( - listOfPointsValues->values()); + GET_PTR_FROM_VALUES(pointValues, arrow::DoubleArray, + listOfPointsValues->values()); const auto setPointsFun = GetSetPointsOfLine(bHasZ, bHasM); const auto nStrings = listOfStringsArray->value_length(nIdxInBatch); const auto nRingOffset = @@ -2521,31 +2789,27 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, break; } - case OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON: { CPLAssert(array->type_id() == arrow::Type::LIST); const auto listOfPartsArray = static_cast(array); CPLAssert(listOfPartsArray->values()->type_id() == arrow::Type::LIST); - const auto listOfPartsValues = - std::static_pointer_cast( - listOfPartsArray->values()); + GET_PTR_FROM_VALUES(listOfPartsValues, arrow::ListArray, + listOfPartsArray->values()); CPLAssert(listOfPartsValues->values()->type_id() == arrow::Type::LIST); - const auto listOfRingsValues = - std::static_pointer_cast( - listOfPartsValues->values()); + GET_PTR_FROM_VALUES(listOfRingsValues, arrow::ListArray, + listOfPartsValues->values()); CPLAssert(listOfRingsValues->values()->type_id() == arrow::Type::FIXED_SIZE_LIST); - const auto listOfPointsValues = - std::static_pointer_cast( - listOfRingsValues->values()); + GET_PTR_FROM_VALUES(listOfPointsValues, arrow::FixedSizeListArray, + listOfRingsValues->values()); CPLAssert(listOfPointsValues->values()->type_id() == arrow::Type::DOUBLE); - const auto pointValues = - std::static_pointer_cast( - listOfPointsValues->values()); + GET_PTR_FROM_VALUES(pointValues, arrow::DoubleArray, + listOfPointsValues->values()); auto poMP = new OGRMultiPolygon(); poGeometry = poMP; poGeometry->assignSpatialReference( @@ -2584,6 +2848,212 @@ inline OGRGeometry *OGRArrowLayer::ReadGeometry(int iGeomField, } break; } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT: + { + CPLAssert(array->type_id() == arrow::Type::STRUCT); + const auto structArray = + static_cast(array); + if (!structArray->IsNull(nIdxInBatch)) + { + poGeometry = CreateStructPoint(structArray, nIdxInBatch); + poGeometry->assignSpatialReference( + poGeomFieldDefn->GetSpatialRef()); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING: + { + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listArray = static_cast(array); + CPLAssert(listArray->values()->type_id() == arrow::Type::STRUCT); + GET_PTR_FROM_VALUES(pointValues, arrow::StructArray, + listArray->values()); + const auto nPoints = listArray->value_length(nIdxInBatch); + const auto nPointOffset = listArray->value_offset(nIdxInBatch); + auto poLineString = new OGRLineString(); + poGeometry = poLineString; + poGeometry->assignSpatialReference( + poGeomFieldDefn->GetSpatialRef()); + if (nPoints) + { + GetSetPointsOfLineStruct(bHasZ, bHasM)( + poLineString, pointValues, nPointOffset, nPoints); + } + else + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON: + { + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listOfRingsArray = + static_cast(array); + CPLAssert(listOfRingsArray->values()->type_id() == + arrow::Type::LIST); + GET_PTR_FROM_VALUES(listOfRingsValues, arrow::ListArray, + listOfRingsArray->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::STRUCT); + GET_PTR_FROM_VALUES(pointValues, arrow::StructArray, + listOfRingsValues->values()); + const auto setPointsFun = GetSetPointsOfLineStruct(bHasZ, bHasM); + const auto nRings = listOfRingsArray->value_length(nIdxInBatch); + const auto nRingOffset = + listOfRingsArray->value_offset(nIdxInBatch); + auto poPoly = new OGRPolygon(); + poGeometry = poPoly; + poGeometry->assignSpatialReference( + poGeomFieldDefn->GetSpatialRef()); + for (auto k = decltype(nRings){0}; k < nRings; k++) + { + const auto nPoints = + listOfRingsValues->value_length(nRingOffset + k); + const auto nPointOffset = + listOfRingsValues->value_offset(nRingOffset + k); + auto poRing = new OGRLinearRing(); + if (nPoints) + { + setPointsFun(poRing, pointValues, nPointOffset, nPoints); + } + poPoly->addRingDirectly(poRing); + } + if (poGeometry->IsEmpty()) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT: + { + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listArray = static_cast(array); + CPLAssert(listArray->values()->type_id() == arrow::Type::STRUCT); + GET_PTR_FROM_VALUES(pointValues, arrow::StructArray, + listArray->values()); + const auto nPoints = listArray->value_length(nIdxInBatch); + const auto nPointOffset = listArray->value_offset(nIdxInBatch); + auto poMultiPoint = new OGRMultiPoint(); + poGeometry = poMultiPoint; + poGeometry->assignSpatialReference( + poGeomFieldDefn->GetSpatialRef()); + for (auto k = decltype(nPoints){0}; k < nPoints; k++) + { + poMultiPoint->addGeometryDirectly( + CreateStructPoint(pointValues, nPointOffset + k)); + } + if (poGeometry->IsEmpty()) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING: + { + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listOfStringsArray = + static_cast(array); + CPLAssert(listOfStringsArray->values()->type_id() == + arrow::Type::LIST); + GET_PTR_FROM_VALUES(listOfStringsValues, arrow::ListArray, + listOfStringsArray->values()); + CPLAssert(listOfStringsValues->values()->type_id() == + arrow::Type::STRUCT); + GET_PTR_FROM_VALUES(pointValues, arrow::StructArray, + listOfStringsValues->values()); + const auto setPointsFun = GetSetPointsOfLineStruct(bHasZ, bHasM); + const auto nStrings = listOfStringsArray->value_length(nIdxInBatch); + const auto nRingOffset = + listOfStringsArray->value_offset(nIdxInBatch); + auto poMLS = new OGRMultiLineString(); + poGeometry = poMLS; + poGeometry->assignSpatialReference( + poGeomFieldDefn->GetSpatialRef()); + for (auto k = decltype(nStrings){0}; k < nStrings; k++) + { + const auto nPoints = + listOfStringsValues->value_length(nRingOffset + k); + const auto nPointOffset = + listOfStringsValues->value_offset(nRingOffset + k); + auto poLS = new OGRLineString(); + if (nPoints) + { + setPointsFun(poLS, pointValues, nPointOffset, nPoints); + } + poMLS->addGeometryDirectly(poLS); + } + if (poGeometry->IsEmpty()) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON: + { + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + GET_PTR_FROM_VALUES(listOfPartsValues, arrow::ListArray, + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::LIST); + GET_PTR_FROM_VALUES(listOfRingsValues, arrow::ListArray, + listOfPartsValues->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::STRUCT); + GET_PTR_FROM_VALUES(pointValues, arrow::StructArray, + listOfRingsValues->values()); + auto poMP = new OGRMultiPolygon(); + poGeometry = poMP; + poGeometry->assignSpatialReference( + poGeomFieldDefn->GetSpatialRef()); + const auto setPointsFun = GetSetPointsOfLineStruct(bHasZ, bHasM); + const auto nParts = listOfPartsArray->value_length(nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(nIdxInBatch); + for (auto j = decltype(nParts){0}; j < nParts; j++) + { + const auto nRings = + listOfPartsValues->value_length(nPartOffset + j); + const auto nRingOffset = + listOfPartsValues->value_offset(nPartOffset + j); + auto poPoly = new OGRPolygon(); + for (auto k = decltype(nRings){0}; k < nRings; k++) + { + const auto nPoints = + listOfRingsValues->value_length(nRingOffset + k); + const auto nPointOffset = + listOfRingsValues->value_offset(nRingOffset + k); + auto poRing = new OGRLinearRing(); + if (nPoints) + { + setPointsFun(poRing, pointValues, nPointOffset, + nPoints); + } + poPoly->addRingDirectly(poRing); + } + poMP->addGeometryDirectly(poPoly); + } + if (poGeometry->IsEmpty()) + { + poGeometry->set3D(bHasZ); + poGeometry->setMeasured(bHasM); + } + break; + } } return poGeometry; } @@ -3468,11 +3938,11 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() iCol = m_anMapGeomFieldIndexToArrowColumn[m_iGeomFieldFilter]; } - OGREnvelope sEnvelopeSkipToNextFeatureDueToBBOX; - const auto SkipToNextFeatureDueToBBOX = - [this, &sEnvelopeSkipToNextFeatureDueToBBOX]() + if (m_poArrayXMinFloat || m_poArrayXMinDouble) { - if (!m_poArrayBBOX || !m_poArrayBBOX->IsNull(m_nIdxInBatch)) + OGREnvelope sEnvelopeSkipToNextFeatureDueToBBOX; + const auto IntersectsBBOX = + [this, &sEnvelopeSkipToNextFeatureDueToBBOX]() { if (m_poArrayXMinFloat && !m_poArrayXMinFloat->IsNull(m_nIdxInBatch)) @@ -3485,7 +3955,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() m_poArrayXMaxFloat->Value(m_nIdxInBatch); sEnvelopeSkipToNextFeatureDueToBBOX.MaxY = m_poArrayYMaxFloat->Value(m_nIdxInBatch); - if (!m_sFilterEnvelope.Intersects( + if (m_sFilterEnvelope.Intersects( sEnvelopeSkipToNextFeatureDueToBBOX)) { return true; @@ -3502,46 +3972,60 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() m_poArrayXMaxDouble->Value(m_nIdxInBatch); sEnvelopeSkipToNextFeatureDueToBBOX.MaxY = m_poArrayYMaxDouble->Value(m_nIdxInBatch); - if (!m_sFilterEnvelope.Intersects( + if (m_sFilterEnvelope.Intersects( sEnvelopeSkipToNextFeatureDueToBBOX)) { return true; } } - } - return false; - }; + return false; + }; - if (iCol >= 0 && - m_aeGeomEncoding[m_iGeomFieldFilter] == OGRArrowGeomEncoding::WKB) + while (true) + { + if (!m_poArrayBBOX->IsNull(m_nIdxInBatch) && IntersectsBBOX() && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + } + } + } + else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::WKB) { CPLAssert(m_poArrayWKB || m_poArrayWKBLarge); OGREnvelope sEnvelope; while (true) { - bool bSkipToNextFeature = false; + bool bMatchBBOX = false; if ((m_poArrayWKB && m_poArrayWKB->IsNull(m_nIdxInBatch)) || (m_poArrayWKBLarge && m_poArrayWKBLarge->IsNull(m_nIdxInBatch))) { - bSkipToNextFeature = true; + // nothing to do } else { - if (m_poArrayXMinFloat || m_poArrayXMinDouble) - { - bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); - } - else if (m_poArrayWKB) + if (m_poArrayWKB) { int out_length = 0; const uint8_t *data = m_poArrayWKB->GetValue(m_nIdxInBatch, &out_length); if (OGRWKBGetBoundingBox(data, out_length, sEnvelope) && - !m_sFilterEnvelope.Intersects(sEnvelope)) + m_sFilterEnvelope.Intersects(sEnvelope)) { - bSkipToNextFeature = true; + bMatchBBOX = true; } } else @@ -3554,23 +4038,19 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() OGRWKBGetBoundingBox(data, static_cast(out_length64), sEnvelope) && - !m_sFilterEnvelope.Intersects(sEnvelope)) + m_sFilterEnvelope.Intersects(sEnvelope)) { - bSkipToNextFeature = true; + bMatchBBOX = true; } } } - if (!bSkipToNextFeature) - { - break; - } - if (!m_asAttributeFilterConstraints.empty() && - !SkipToNextFeatureDueToAttributeFilter()) + if (bMatchBBOX && (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) { break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3580,8 +4060,9 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() } } } - else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == - OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON) + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON) { const auto poGeomFieldDefn = m_poFeatureDefn->GetGeomFieldDefn(m_iGeomFieldFilter); @@ -3590,133 +4071,603 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() const bool bHasM = CPL_TO_BOOL(OGR_GT_HasM(eGeomType)); const int nDim = 2 + (bHasZ ? 1 : 0) + (bHasM ? 1 : 0); - begin_multipolygon: - auto array = m_poBatchColumns[iCol].get(); - CPLAssert(array->type_id() == arrow::Type::LIST); - auto listOfPartsArray = - static_cast(array); - CPLAssert(listOfPartsArray->values()->type_id() == - arrow::Type::LIST); - auto listOfPartsValues = std::static_pointer_cast( - listOfPartsArray->values()); - CPLAssert(listOfPartsValues->values()->type_id() == - arrow::Type::LIST); - auto listOfRingsValues = std::static_pointer_cast( - listOfPartsValues->values()); - CPLAssert(listOfRingsValues->values()->type_id() == - arrow::Type::FIXED_SIZE_LIST); - auto listOfPointsValues = - std::static_pointer_cast( - listOfRingsValues->values()); - CPLAssert(listOfPointsValues->values()->type_id() == - arrow::Type::DOUBLE); - auto pointValues = std::static_pointer_cast( - listOfPointsValues->values()); + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + auto listOfPartsValues = + std::static_pointer_cast( + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::LIST); + auto listOfRingsValues = + std::static_pointer_cast( + listOfPartsValues->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::FIXED_SIZE_LIST); + auto listOfPointsValues = + std::static_pointer_cast( + listOfRingsValues->values()); + CPLAssert(listOfPointsValues->values()->type_id() == + arrow::Type::DOUBLE); + auto pointValues = std::static_pointer_cast( + listOfPointsValues->values()); - while (true) + while (true) + { + bool bMatchBBOX = false; + if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + { + OGREnvelope sEnvelope; + const auto nParts = + listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(m_nIdxInBatch); + for (auto j = decltype(nParts){0}; j < nParts; j++) + { + const auto nRings = listOfPartsValues->value_length( + nPartOffset + j); + const auto nRingOffset = + listOfPartsValues->value_offset(nPartOffset + + j); + if (nRings >= 1) + { + const auto nPoints = + listOfRingsValues->value_length( + nRingOffset); + const auto nPointOffset = + listOfRingsValues->value_offset( + nRingOffset) * + nDim; + const double *padfRawValue = + pointValues->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawValue[nDim * l], + padfRawValue[nDim * l + 1]); + } + // for bounding box, only the first ring matters + } + } + + if (nParts != 0 && + m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } + } + } while (!bReturnFeature); + } + else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::STRUCT); + auto pointValues = + static_cast(array); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) + { + bool bMatchBBOX = false; + if (!array->IsNull(m_nIdxInBatch)) + { + const double dfX = fieldXDouble->Value(m_nIdxInBatch); + const double dfY = fieldYDouble->Value(m_nIdxInBatch); + if (dfX >= m_sFilterEnvelope.MinX && + dfY >= m_sFilterEnvelope.MinY && + dfX <= m_sFilterEnvelope.MaxX && + dfY <= m_sFilterEnvelope.MaxY) + { + bMatchBBOX = true; + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } + } + } while (!bReturnFeature); + } + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING) + { + bool bReturnFeature; + do { - bool bSkipToNextFeature = false; - if (m_poArrayXMinFloat || m_poArrayXMinDouble) + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listArray = + static_cast(array); + CPLAssert(listArray->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listArray->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); + bool bMatchBBOX = false; + if (!listArray->IsNull(m_nIdxInBatch)) + { + OGREnvelope sEnvelope; + const auto nPoints = + listArray->value_length(m_nIdxInBatch); + const auto nPointOffset = + listArray->value_offset(m_nIdxInBatch); + if (nPoints > 0) + { + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); + } + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } } - else if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + } while (!bReturnFeature); + } + else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listOfRingsArray = + static_cast(array); + CPLAssert(listOfRingsArray->values()->type_id() == + arrow::Type::LIST); + const auto listOfRingsValues = + std::static_pointer_cast( + listOfRingsArray->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listOfRingsValues->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - OGREnvelope sEnvelope; - const auto nParts = - listOfPartsArray->value_length(m_nIdxInBatch); - const auto nPartOffset = - listOfPartsArray->value_offset(m_nIdxInBatch); - for (auto j = decltype(nParts){0}; j < nParts; j++) + bool bMatchBBOX = false; + if (!listOfRingsArray->IsNull(m_nIdxInBatch)) { + OGREnvelope sEnvelope; const auto nRings = - listOfPartsValues->value_length(nPartOffset + j); + listOfRingsArray->value_length(m_nIdxInBatch); const auto nRingOffset = - listOfPartsValues->value_offset(nPartOffset + j); + listOfRingsArray->value_offset(m_nIdxInBatch); if (nRings >= 1) { const auto nPoints = listOfRingsValues->value_length(nRingOffset); const auto nPointOffset = - listOfRingsValues->value_offset(nRingOffset) * - nDim; - const double *padfRawValue = - pointValues->raw_values() + nPointOffset; + listOfRingsValues->value_offset(nRingOffset); + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; for (auto l = decltype(nPoints){0}; l < nPoints; ++l) { - sEnvelope.Merge(padfRawValue[nDim * l], - padfRawValue[nDim * l + 1]); + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); } // for bounding box, only the first ring matters + + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } } } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } - if (nParts != 0 && m_sFilterEnvelope.Intersects(sEnvelope)) + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) { - bSkipToNextFeature = true; + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; } } - if (!bSkipToNextFeature) - { - break; - } - if (!m_asAttributeFilterConstraints.empty() && - !SkipToNextFeatureDueToAttributeFilter()) + } while (!bReturnFeature); + } + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listArray = + static_cast(array); + CPLAssert(listArray->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listArray->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - break; - } + bool bMatchBBOX = false; + if (!listArray->IsNull(m_nIdxInBatch)) + { + const auto nPoints = + listArray->value_length(m_nIdxInBatch); + const auto nPointOffset = + listArray->value_offset(m_nIdxInBatch); + if (nPoints > 0) + { + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + if (padfRawXValue[l] >= + m_sFilterEnvelope.MinX && + padfRawYValue[l] >= + m_sFilterEnvelope.MinY && + padfRawXValue[l] <= + m_sFilterEnvelope.MaxX && + padfRawYValue[l] <= m_sFilterEnvelope.MaxY) + { + bMatchBBOX = true; + break; + } + } + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } - m_nFeatureIdx++; - m_nIdxInBatch++; - if (m_nIdxInBatch == m_poBatch->num_rows()) - { - m_bEOF = !ReadNextBatch(); - if (m_bEOF) - return nullptr; - goto begin_multipolygon; + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } } - } + } while (!bReturnFeature); } - else if (iCol >= 0) + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING) { - auto array = m_poBatchColumns[iCol].get(); - while (true) + bool bReturnFeature; + do { - bool bSkipToNextFeature = false; - if (m_poArrayXMinFloat || m_poArrayXMinDouble) - { - bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); - } - else + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + auto listOfPartsValues = + std::static_pointer_cast( + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listOfPartsValues->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - auto poGeometry = std::unique_ptr( - ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); - if (poGeometry == nullptr || poGeometry->IsEmpty()) + bool bMatchBBOX = false; + if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + { + const auto nParts = + listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(m_nIdxInBatch); + for (auto j = decltype(nParts){0}; + j < nParts && !bMatchBBOX; j++) + { + OGREnvelope sEnvelope; + const auto nPoints = + listOfPartsValues->value_length(nPartOffset + + j); + const auto nPointOffset = + listOfPartsValues->value_offset(nPartOffset + + j); + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); + } + + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) { - bSkipToNextFeature = true; + bReturnFeature = true; + break; } - else + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) { - OGREnvelope sEnvelope; - poGeometry->getEnvelope(&sEnvelope); - if (!m_sFilterEnvelope.Intersects(sEnvelope)) + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } + } + } while (!bReturnFeature); + } + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + auto listOfPartsValues = + std::static_pointer_cast( + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::LIST); + auto listOfRingsValues = + std::static_pointer_cast( + listOfPartsValues->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listOfRingsValues->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) + { + bool bMatchBBOX = false; + if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + { + const auto nParts = + listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(m_nIdxInBatch); + for (auto j = decltype(nParts){0}; + j < nParts && !bMatchBBOX; j++) { - bSkipToNextFeature = true; + OGREnvelope sEnvelope; + const auto nRings = listOfPartsValues->value_length( + nPartOffset + j); + const auto nRingOffset = + listOfPartsValues->value_offset(nPartOffset + + j); + if (nRings >= 1) + { + const auto nPoints = + listOfRingsValues->value_length( + nRingOffset); + const auto nPointOffset = + listOfRingsValues->value_offset( + nRingOffset); + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); + } + + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + // for bounding box, only the first ring matters + } } } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } } - if (!bSkipToNextFeature) + } while (!bReturnFeature); + } + else if (iCol >= 0) + { + auto array = m_poBatchColumns[iCol].get(); + while (true) + { + bool bMatchBBOX = false; + + auto poGeometry = std::unique_ptr( + ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); + if (poGeometry && !poGeometry->IsEmpty()) { - break; + OGREnvelope sEnvelope; + poGeometry->getEnvelope(&sEnvelope); + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } } - if (!m_asAttributeFilterConstraints.empty() && - !SkipToNextFeatureDueToAttributeFilter()) + if (bMatchBBOX && (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) { break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3738,7 +4689,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3754,7 +4705,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() if (m_iFIDArrowColumn < 0) poFeature->SetFID(m_nFeatureIdx); - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; return poFeature; @@ -3970,7 +4921,6 @@ inline OGRErr OGRArrowLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, } } - m_nFeatureIdx++; m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -4000,7 +4950,7 @@ inline OGRErr OGRArrowLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, } } else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON) + OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON) { ResetReading(); if (m_poBatch == nullptr) @@ -4070,7 +5020,6 @@ inline OGRErr OGRArrowLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, } } - m_nFeatureIdx++; m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -4599,7 +5548,11 @@ inline int OGRArrowLayer::GetNextArrowArray(struct ArrowArrayStream *stream, OverrideArrowRelease(m_poArrowDS, out_array); const auto nFeatureIdxCur = m_nFeatureIdx; - m_nFeatureIdx += m_nIdxInBatch; + // TODO: We likely have an issue regarding FIDs based on m_nFeatureIdx + // when m_iFIDArrowColumn < 0, only a subset of row groups is + // selected, and this batch goes accross non consecutive row groups. + for (int64_t i = 0; i < m_nIdxInBatch; ++i) + IncrFeatureIdx(); if (m_poAttrQuery || m_poFilterGeom) { diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp index 278d98e86fc1..32e01cd0b6ab 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowwriterlayer.hpp @@ -60,6 +60,13 @@ static constexpr int TZFLAG_UNINITIALIZED = -1; #define OGR_ARROW_RETURN_OGRERR_NOT_OK(status) \ OGR_ARROW_RETURN_NOT_OK(status, OGRERR_FAILURE) +#define OGR_ARROW_PROPAGATE_OGRERR(ret_value) \ + do \ + { \ + if ((ret_value) != OGRERR_NONE) \ + return OGRERR_FAILURE; \ + } while (0) + /************************************************************************/ /* OGRArrowWriterLayer() */ /************************************************************************/ @@ -275,6 +282,8 @@ inline void OGRArrowWriterLayer::CreateSchemaCommon() 2 + (OGR_GT_HasZ(eGType) ? 1 : 0) + (OGR_GT_HasM(eGType) ? 1 : 0); const bool pointFieldNullable = GetDriverUCName() == "PARQUET"; + + // Fixed Size List GeoArrow encoding std::shared_ptr pointField; if (nDim == 2) pointField = @@ -289,6 +298,20 @@ inline void OGRArrowWriterLayer::CreateSchemaCommon() pointField = arrow::field("xyzm", arrow::float64(), pointFieldNullable); + // Struct GeoArrow encoding + auto xField(arrow::field("x", arrow::float64(), false)); + auto yField(arrow::field("y", arrow::float64(), false)); + std::vector> pointFields{ + arrow::field("x", arrow::float64(), false), + arrow::field("y", arrow::float64(), false)}; + if (OGR_GT_HasZ(eGType)) + pointFields.emplace_back( + arrow::field("z", arrow::float64(), false)); + if (OGR_GT_HasM(eGType)) + pointFields.emplace_back( + arrow::field("m", arrow::float64(), false)); + auto pointStructType(arrow::struct_(std::move(pointFields))); + std::shared_ptr dt; switch (m_aeGeomEncoding[i]) { @@ -300,36 +323,61 @@ inline void OGRArrowWriterLayer::CreateSchemaCommon() dt = arrow::utf8(); break; - case OGRArrowGeomEncoding::GEOARROW_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC: CPLAssert(false); break; - case OGRArrowGeomEncoding::GEOARROW_POINT: + case OGRArrowGeomEncoding::GEOARROW_FSL_POINT: dt = arrow::fixed_size_list(pointField, nDim); break; - case OGRArrowGeomEncoding::GEOARROW_LINESTRING: + case OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING: dt = arrow::list(arrow::fixed_size_list(pointField, nDim)); break; - case OGRArrowGeomEncoding::GEOARROW_POLYGON: + case OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON: dt = arrow::list( arrow::list(arrow::fixed_size_list(pointField, nDim))); break; - case OGRArrowGeomEncoding::GEOARROW_MULTIPOINT: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT: dt = arrow::list(arrow::fixed_size_list(pointField, nDim)); break; - case OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING: dt = arrow::list( arrow::list(arrow::fixed_size_list(pointField, nDim))); break; - case OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON: dt = arrow::list(arrow::list( arrow::list(arrow::fixed_size_list(pointField, nDim)))); break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT: + dt = pointStructType; + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING: + dt = arrow::list(pointStructType); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON: + dt = arrow::list(arrow::list(pointStructType)); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT: + dt = arrow::list(pointStructType); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING: + dt = arrow::list(arrow::list(pointStructType)); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON: + dt = arrow::list(arrow::list(arrow::list(pointStructType))); + break; } std::shared_ptr field( @@ -346,6 +394,8 @@ inline void OGRArrowWriterLayer::CreateSchemaCommon() field = field->WithMetadata(kvMetadata); } + m_apoBaseStructGeomType.emplace_back(std::move(pointStructType)); + fields.emplace_back(std::move(field)); } @@ -644,43 +694,57 @@ inline bool OGRArrowWriterLayer::CreateFieldFromArrowSchema( } /************************************************************************/ -/* GetPreciseArrowGeomEncoding() */ +/* GetPreciseArrowGeomEncoding() */ /************************************************************************/ -inline OGRArrowGeomEncoding -OGRArrowWriterLayer::GetPreciseArrowGeomEncoding(OGRwkbGeometryType eGType) +inline OGRArrowGeomEncoding OGRArrowWriterLayer::GetPreciseArrowGeomEncoding( + OGRArrowGeomEncoding eEncodingType, OGRwkbGeometryType eGType) { + CPLAssert(eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC || + eEncodingType == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC); const auto eFlatType = wkbFlatten(eGType); if (eFlatType == wkbPoint) { - return OGRArrowGeomEncoding::GEOARROW_POINT; + return eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC + ? OGRArrowGeomEncoding::GEOARROW_FSL_POINT + : OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT; } else if (eFlatType == wkbLineString) { - return OGRArrowGeomEncoding::GEOARROW_LINESTRING; + return eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC + ? OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING + : OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING; } else if (eFlatType == wkbPolygon) { - return OGRArrowGeomEncoding::GEOARROW_POLYGON; + return eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC + ? OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON + : OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON; } else if (eFlatType == wkbMultiPoint) { - return OGRArrowGeomEncoding::GEOARROW_MULTIPOINT; + return eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC + ? OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT + : OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT; } else if (eFlatType == wkbMultiLineString) { - return OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING; + return eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC + ? OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING + : OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING; } else if (eFlatType == wkbMultiPolygon) { - return OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON; + return eEncodingType == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC + ? OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON + : OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON; } else { CPLError(CE_Failure, CPLE_NotSupported, - "GEOMETRY_FORMAT=GEOARROW is currently not supported for %s", + "GeoArrow encoding is currently not supported for %s", OGRGeometryTypeToName(eGType)); - return OGRArrowGeomEncoding::GEOARROW_GENERIC; + return eEncodingType; } } @@ -695,24 +759,38 @@ OGRArrowWriterLayer::GetGeomEncodingAsString(OGRArrowGeomEncoding eGeomEncoding, switch (eGeomEncoding) { case OGRArrowGeomEncoding::WKB: - return bForParquetGeo ? "WKB" : "ogc.wkb"; + return bForParquetGeo ? "WKB" : "geoarrow.wkb"; case OGRArrowGeomEncoding::WKT: - return bForParquetGeo ? "WKT" : "ogc.wkt"; - case OGRArrowGeomEncoding::GEOARROW_GENERIC: + return bForParquetGeo ? "WKT" : "geoarrow.wkt"; + case OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC: CPLAssert(false); break; - case OGRArrowGeomEncoding::GEOARROW_POINT: + case OGRArrowGeomEncoding::GEOARROW_FSL_POINT: return "geoarrow.point"; - case OGRArrowGeomEncoding::GEOARROW_LINESTRING: + case OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING: return "geoarrow.linestring"; - case OGRArrowGeomEncoding::GEOARROW_POLYGON: + case OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON: return "geoarrow.polygon"; - case OGRArrowGeomEncoding::GEOARROW_MULTIPOINT: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT: return "geoarrow.multipoint"; - case OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING: return "geoarrow.multilinestring"; - case OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON: + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON: return "geoarrow.multipolygon"; + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT: + return bForParquetGeo ? "point" : "geoarrow.point"; + case OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING: + return bForParquetGeo ? "linestring" : "geoarrow.linestring"; + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON: + return bForParquetGeo ? "polygon" : "geoarrow.polygon"; + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT: + return bForParquetGeo ? "multipoint" : "geoarrow.multipoint"; + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING: + return bForParquetGeo ? "multilinestring" + : "geoarrow.multilinestring"; + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON: + return bForParquetGeo ? "multipolygon" : "geoarrow.multipolygon"; } return nullptr; } @@ -743,10 +821,12 @@ OGRArrowWriterLayer::CreateGeomField(const OGRGeomFieldDefn *poField, "Geometry column should have an associated CRS"); } auto eGeomEncoding = m_eGeomEncoding; - if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC) + if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC || + eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC) { - eGeomEncoding = GetPreciseArrowGeomEncoding(eGType); - if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC) + const auto eEncodingType = eGeomEncoding; + eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType); + if (eGeomEncoding == eEncodingType) return OGRERR_FAILURE; } m_aeGeomEncoding.push_back(eGeomEncoding); @@ -770,6 +850,29 @@ MakeGeoArrowBuilder(arrow::MemoryPool *poMemoryPool, int nDim, int nDepth) poMemoryPool, MakeGeoArrowBuilder(poMemoryPool, nDim, nDepth - 1)); } +/************************************************************************/ +/* MakeGeoArrowStructBuilder() */ +/************************************************************************/ + +static std::shared_ptr +MakeGeoArrowStructBuilder(arrow::MemoryPool *poMemoryPool, int nDim, int nDepth, + const std::shared_ptr &eBaseType) +{ + if (nDepth == 0) + { + std::vector> builders; + for (int i = 0; i < nDim; ++i) + builders.emplace_back( + std::make_shared(poMemoryPool)); + return std::make_shared(eBaseType, poMemoryPool, + std::move(builders)); + } + else + return std::make_shared( + poMemoryPool, MakeGeoArrowStructBuilder(poMemoryPool, nDim, + nDepth - 1, eBaseType)); +} + /************************************************************************/ /* ClearArrayBuilers() */ /************************************************************************/ @@ -926,42 +1029,78 @@ inline void OGRArrowWriterLayer::CreateArrayBuilders() const int nDim = 2 + (OGR_GT_HasZ(eGType) ? 1 : 0) + (OGR_GT_HasM(eGType) ? 1 : 0); - if (m_aeGeomEncoding[i] == OGRArrowGeomEncoding::WKB) - builder = std::make_shared(m_poMemoryPool); - else if (m_aeGeomEncoding[i] == OGRArrowGeomEncoding::WKT) - builder = std::make_shared(m_poMemoryPool); - else if (m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POINT) - { - builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 0); - } - else if (m_aeGeomEncoding[i] == - OGRArrowGeomEncoding::GEOARROW_LINESTRING) - { - builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 1); - } - else if (m_aeGeomEncoding[i] == OGRArrowGeomEncoding::GEOARROW_POLYGON) - { - builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 2); - } - else if (m_aeGeomEncoding[i] == - OGRArrowGeomEncoding::GEOARROW_MULTIPOINT) - { - builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 1); - } - else if (m_aeGeomEncoding[i] == - OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING) - { - builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 2); - } - else if (m_aeGeomEncoding[i] == - OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON) - { - builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 3); - } - else + switch (m_aeGeomEncoding[i]) { - CPLAssert(false); + case OGRArrowGeomEncoding::WKB: + builder = + std::make_shared(m_poMemoryPool); + break; + + case OGRArrowGeomEncoding::WKT: + builder = + std::make_shared(m_poMemoryPool); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_POINT: + builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 0); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING: + builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 1); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON: + builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 2); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT: + builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 1); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING: + builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 2); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON: + builder = MakeGeoArrowBuilder(m_poMemoryPool, nDim, 3); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT: + builder = MakeGeoArrowStructBuilder(m_poMemoryPool, nDim, 0, + m_apoBaseStructGeomType[i]); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING: + builder = MakeGeoArrowStructBuilder(m_poMemoryPool, nDim, 1, + m_apoBaseStructGeomType[i]); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON: + builder = MakeGeoArrowStructBuilder(m_poMemoryPool, nDim, 2, + m_apoBaseStructGeomType[i]); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT: + builder = MakeGeoArrowStructBuilder(m_poMemoryPool, nDim, 1, + m_apoBaseStructGeomType[i]); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING: + builder = MakeGeoArrowStructBuilder(m_poMemoryPool, nDim, 2, + m_apoBaseStructGeomType[i]); + break; + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON: + builder = MakeGeoArrowStructBuilder(m_poMemoryPool, nDim, 3, + m_apoBaseStructGeomType[i]); + break; + + case OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC: + CPLAssert(false); + break; } + m_apoBuilders.emplace_back(builder); if (m_bWriteBBoxStruct) @@ -1021,6 +1160,31 @@ static float castToFloatUp(double d) return f; } +/************************************************************************/ +/* GeoArrowLineBuilder() */ +/************************************************************************/ + +template +static OGRErr GeoArrowLineBuilder(const OGRLineString *poLS, + PointBuilderType *poPointBuilder, + arrow::DoubleBuilder *poXBuilder, + arrow::DoubleBuilder *poYBuilder, + arrow::DoubleBuilder *poZBuilder, + arrow::DoubleBuilder *poMBuilder) +{ + for (int j = 0; j < poLS->getNumPoints(); ++j) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poXBuilder->Append(poLS->getX(j))); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poYBuilder->Append(poLS->getY(j))); + if (poZBuilder) + OGR_ARROW_RETURN_OGRERR_NOT_OK(poZBuilder->Append(poLS->getZ(j))); + if (poMBuilder) + OGR_ARROW_RETURN_OGRERR_NOT_OK(poMBuilder->Append(poLS->getM(j))); + } + return OGRERR_NONE; +} + /************************************************************************/ /* BuildGeometry() */ /************************************************************************/ @@ -1032,6 +1196,8 @@ inline OGRErr OGRArrowWriterLayer::BuildGeometry(OGRGeometry *poGeom, const auto eGType = poGeom ? poGeom->getGeometryType() : wkbNone; const auto eColumnGType = m_poFeatureDefn->GetGeomFieldDefn(iGeomField)->GetType(); + const bool bHasZ = CPL_TO_BOOL(OGR_GT_HasZ(eColumnGType)); + const bool bHasM = CPL_TO_BOOL(OGR_GT_HasM(eColumnGType)); const bool bIsEmpty = poGeom != nullptr && poGeom->IsEmpty(); OGREnvelope3D oEnvelope; if (poGeom != nullptr && !bIsEmpty) @@ -1078,7 +1244,7 @@ inline OGRErr OGRArrowWriterLayer::BuildGeometry(OGRGeometry *poGeom, if (poGeom == nullptr) { if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_POINT && + OGRArrowGeomEncoding::GEOARROW_FSL_POINT && GetDriverUCName() == "PARQUET") { // For some reason, Parquet doesn't support a NULL FixedSizeList @@ -1103,265 +1269,408 @@ inline OGRErr OGRArrowWriterLayer::BuildGeometry(OGRGeometry *poGeom, { OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); } + + return OGRERR_NONE; } - else if (m_aeGeomEncoding[iGeomField] == OGRArrowGeomEncoding::WKB) + + // The following checks are only valid for GeoArrow encoding + if (m_aeGeomEncoding[iGeomField] != OGRArrowGeomEncoding::WKB && + m_aeGeomEncoding[iGeomField] != OGRArrowGeomEncoding::WKT) { - std::unique_ptr poGeomModified; - if (OGR_GT_HasM(eGType) && !OGR_GT_HasM(eColumnGType)) + if ((!bIsEmpty && eGType != eColumnGType) || + (bIsEmpty && wkbFlatten(eGType) != wkbFlatten(eColumnGType))) { - static bool bHasWarned = false; - if (!bHasWarned) + CPLError(CE_Warning, CPLE_AppDefined, + "Geometry of type %s found, whereas %s is expected. " + "Writing null geometry", + OGRGeometryTypeToName(eGType), + OGRGeometryTypeToName(eColumnGType)); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); + + return OGRERR_NONE; + } + } + + switch (m_aeGeomEncoding[iGeomField]) + { + case OGRArrowGeomEncoding::WKB: + { + std::unique_ptr poGeomModified; + if (OGR_GT_HasM(eGType) && !OGR_GT_HasM(eColumnGType)) + { + static bool bHasWarned = false; + if (!bHasWarned) + { + CPLError(CE_Warning, CPLE_AppDefined, + "Removing M component from geometry"); + bHasWarned = true; + } + poGeomModified.reset(poGeom->clone()); + poGeomModified->setMeasured(false); + poGeom = poGeomModified.get(); + } + FixupGeometryBeforeWriting(poGeom); + const auto nSize = poGeom->WkbSize(); + if (nSize < INT_MAX) + { + m_abyBuffer.resize(nSize); + poGeom->exportToWkb(wkbNDR, &m_abyBuffer[0], wkbVariantIso); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + static_cast(poBuilder)->Append( + m_abyBuffer.data(), + static_cast(m_abyBuffer.size()))); + } + else { CPLError(CE_Warning, CPLE_AppDefined, - "Removing M component from geometry"); - bHasWarned = true; + "Too big geometry. " + "Writing null geometry"); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); } - poGeomModified.reset(poGeom->clone()); - poGeomModified->setMeasured(false); - poGeom = poGeomModified.get(); + break; } - FixupGeometryBeforeWriting(poGeom); - const auto nSize = poGeom->WkbSize(); - if (nSize < INT_MAX) + + case OGRArrowGeomEncoding::WKT: { - m_abyBuffer.resize(nSize); - poGeom->exportToWkb(wkbNDR, &m_abyBuffer[0], wkbVariantIso); + OGRWktOptions options; + options.variant = wkbVariantIso; + if (m_nWKTCoordinatePrecision >= 0) + { + options.format = OGRWktFormat::F; + options.xyPrecision = m_nWKTCoordinatePrecision; + options.zPrecision = m_nWKTCoordinatePrecision; + options.mPrecision = m_nWKTCoordinatePrecision; + } OGR_ARROW_RETURN_OGRERR_NOT_OK( - static_cast(poBuilder)->Append( - m_abyBuffer.data(), static_cast(m_abyBuffer.size()))); + static_cast(poBuilder)->Append( + poGeom->exportToWkt(options))); + break; } - else + + case OGRArrowGeomEncoding::GEOARROW_FSL_POINT: { - CPLError(CE_Warning, CPLE_AppDefined, - "Too big geometry. " - "Writing null geometry"); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); + const auto poPoint = poGeom->toPoint(); + auto poPointBuilder = + static_cast(poBuilder); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); + auto poValueBuilder = static_cast( + poPointBuilder->value_builder()); + if (bIsEmpty) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poValueBuilder->Append( + std::numeric_limits::quiet_NaN())); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poValueBuilder->Append( + std::numeric_limits::quiet_NaN())); + } + else + { + OGR_ARROW_RETURN_OGRERR_NOT_OK( + poValueBuilder->Append(poPoint->getX())); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + poValueBuilder->Append(poPoint->getY())); + } + if (bHasZ) + OGR_ARROW_RETURN_OGRERR_NOT_OK( + poValueBuilder->Append(poPoint->getZ())); + if (bHasM) + OGR_ARROW_RETURN_OGRERR_NOT_OK( + poValueBuilder->Append(poPoint->getM())); + break; } - } - else if (m_aeGeomEncoding[iGeomField] == OGRArrowGeomEncoding::WKT) - { - OGRWktOptions options; - options.variant = wkbVariantIso; - if (m_nWKTCoordinatePrecision >= 0) + +#define GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder) \ + auto poXBuilder = \ + static_cast(poPointBuilder->field_builder(0)); \ + auto poYBuilder = \ + static_cast(poPointBuilder->field_builder(1)); \ + int iSubField = 2; \ + arrow::DoubleBuilder *poZBuilder = nullptr; \ + if (bHasZ) \ + { \ + poZBuilder = static_cast( \ + poPointBuilder->field_builder(iSubField)); \ + ++iSubField; \ + } \ + arrow::DoubleBuilder *poMBuilder = nullptr; \ + if (bHasM) \ + { \ + poMBuilder = static_cast( \ + poPointBuilder->field_builder(iSubField)); \ + } \ + do \ + { \ + } while (0) + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT: { - options.format = OGRWktFormat::F; - options.xyPrecision = m_nWKTCoordinatePrecision; - options.zPrecision = m_nWKTCoordinatePrecision; - options.mPrecision = m_nWKTCoordinatePrecision; + const auto poPoint = poGeom->toPoint(); + auto poPointBuilder = + static_cast(poBuilder); + GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); + + if (bIsEmpty) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poXBuilder->Append( + std::numeric_limits::quiet_NaN())); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poYBuilder->Append( + std::numeric_limits::quiet_NaN())); + } + else + { + OGR_ARROW_RETURN_OGRERR_NOT_OK( + poXBuilder->Append(poPoint->getX())); + OGR_ARROW_RETURN_OGRERR_NOT_OK( + poYBuilder->Append(poPoint->getY())); + } + if (poZBuilder) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poZBuilder->Append( + bIsEmpty ? std::numeric_limits::quiet_NaN() + : poPoint->getZ())); + } + if (poMBuilder) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poMBuilder->Append( + bIsEmpty ? std::numeric_limits::quiet_NaN() + : poPoint->getM())); + } + break; } - OGR_ARROW_RETURN_OGRERR_NOT_OK( - static_cast(poBuilder)->Append( - poGeom->exportToWkt(options))); - } - // The following checks are only valid for GeoArrow encoding - else if ((!bIsEmpty && eGType != eColumnGType) || - (bIsEmpty && wkbFlatten(eGType) != wkbFlatten(eColumnGType))) - { - CPLError(CE_Warning, CPLE_AppDefined, - "Geometry of type %s found, whereas %s is expected. " - "Writing null geometry", - OGRGeometryTypeToName(eGType), - OGRGeometryTypeToName(eColumnGType)); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); - } - else if (!bIsEmpty && poGeom->Is3D() != OGR_GT_HasZ(eColumnGType)) - { - CPLError(CE_Warning, CPLE_AppDefined, - "Geometry Z flag (%d) != column geometry type Z flag (%d)d. " - "Writing null geometry", - poGeom->Is3D(), OGR_GT_HasZ(eColumnGType)); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); - } - else if (!bIsEmpty && poGeom->IsMeasured() != OGR_GT_HasM(eColumnGType)) - { - CPLError(CE_Warning, CPLE_AppDefined, - "Geometry M flag (%d) != column geometry type M flag (%d)d. " - "Writing null geometry", - poGeom->IsMeasured(), OGR_GT_HasM(eColumnGType)); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poBuilder->AppendNull()); - } - else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_POINT) - { - const auto poPoint = poGeom->toPoint(); - auto poPointBuilder = - static_cast(poBuilder); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); - auto poValueBuilder = static_cast( - poPointBuilder->value_builder()); - if (bIsEmpty) + + case OGRArrowGeomEncoding::GEOARROW_FSL_LINESTRING: { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poValueBuilder->Append( - std::numeric_limits::quiet_NaN())); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poValueBuilder->Append( - std::numeric_limits::quiet_NaN())); + const auto poLS = poGeom->toLineString(); + auto poListBuilder = static_cast(poBuilder); + auto poPointBuilder = static_cast( + poListBuilder->value_builder()); + auto poValueBuilder = static_cast( + poPointBuilder->value_builder()); + + OGR_ARROW_RETURN_OGRERR_NOT_OK(poListBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR(GeoArrowLineBuilder( + poLS, poPointBuilder, poValueBuilder, poValueBuilder, + bHasZ ? poValueBuilder : nullptr, + bHasM ? poValueBuilder : nullptr)); + break; } - else + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING: { - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getX())); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getY())); + const auto poLS = poGeom->toLineString(); + auto poListBuilder = static_cast(poBuilder); + auto poPointBuilder = static_cast( + poListBuilder->value_builder()); + GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder); + + OGR_ARROW_RETURN_OGRERR_NOT_OK(poListBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR( + GeoArrowLineBuilder(poLS, poPointBuilder, poXBuilder, + poYBuilder, poZBuilder, poMBuilder)); + break; } - if (OGR_GT_HasZ(eColumnGType)) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getZ())); - if (OGR_GT_HasM(eColumnGType)) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getM())); - } - else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_LINESTRING) - { - const auto poLS = poGeom->toLineString(); - auto poListBuilder = static_cast(poBuilder); - auto poPointBuilder = static_cast( - poListBuilder->value_builder()); - auto poValueBuilder = static_cast( - poPointBuilder->value_builder()); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poListBuilder->Append()); - for (int j = 0; j < poLS->getNumPoints(); ++j) + + case OGRArrowGeomEncoding::GEOARROW_FSL_POLYGON: { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getX(j))); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getY(j))); - if (poGeom->Is3D()) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getZ(j))); - if (poGeom->IsMeasured()) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getM(j))); + const auto poPolygon = poGeom->toPolygon(); + auto poPolygonBuilder = + static_cast(poBuilder); + auto poRingBuilder = static_cast( + poPolygonBuilder->value_builder()); + auto poPointBuilder = static_cast( + poRingBuilder->value_builder()); + auto poValueBuilder = static_cast( + poPointBuilder->value_builder()); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPolygonBuilder->Append()); + for (const auto *poRing : *poPolygon) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poRingBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR(GeoArrowLineBuilder( + poRing, poPointBuilder, poValueBuilder, poValueBuilder, + bHasZ ? poValueBuilder : nullptr, + bHasM ? poValueBuilder : nullptr)); + } + break; } - } - else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_POLYGON) - { - const auto poPolygon = poGeom->toPolygon(); - auto poPolygonBuilder = static_cast(poBuilder); - auto poRingBuilder = static_cast( - poPolygonBuilder->value_builder()); - auto poPointBuilder = static_cast( - poRingBuilder->value_builder()); - auto poValueBuilder = static_cast( - poPointBuilder->value_builder()); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poPolygonBuilder->Append()); - for (const auto *poRing : *poPolygon) + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON: + { + const auto poPolygon = poGeom->toPolygon(); + auto poPolygonBuilder = + static_cast(poBuilder); + auto poRingBuilder = static_cast( + poPolygonBuilder->value_builder()); + auto poPointBuilder = static_cast( + poRingBuilder->value_builder()); + GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder); + + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPolygonBuilder->Append()); + for (const auto *poRing : *poPolygon) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poRingBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR( + GeoArrowLineBuilder(poRing, poPointBuilder, poXBuilder, + poYBuilder, poZBuilder, poMBuilder)); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOINT: { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poRingBuilder->Append()); - for (int j = 0; j < poRing->getNumPoints(); ++j) + const auto poMultiPoint = poGeom->toMultiPoint(); + auto poListBuilder = static_cast(poBuilder); + auto poPointBuilder = static_cast( + poListBuilder->value_builder()); + auto poValueBuilder = static_cast( + poPointBuilder->value_builder()); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poListBuilder->Append()); + for (const auto *poPoint : *poMultiPoint) { OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getX(j))); + poValueBuilder->Append(poPoint->getX())); OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getY(j))); - if (poGeom->Is3D()) + poValueBuilder->Append(poPoint->getY())); + if (bHasZ) OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getZ(j))); - if (poGeom->IsMeasured()) + poValueBuilder->Append(poPoint->getZ())); + if (bHasM) OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getM(j))); + poValueBuilder->Append(poPoint->getM())); } + break; } - } - else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_MULTIPOINT) - { - const auto poMultiPoint = poGeom->toMultiPoint(); - auto poListBuilder = static_cast(poBuilder); - auto poPointBuilder = static_cast( - poListBuilder->value_builder()); - auto poValueBuilder = static_cast( - poPointBuilder->value_builder()); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poListBuilder->Append()); - for (const auto *poPoint : *poMultiPoint) - { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getX())); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getY())); - if (poGeom->Is3D()) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getZ())); - if (poGeom->IsMeasured()) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poPoint->getM())); - } - } - else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_MULTILINESTRING) - { - const auto poMLS = poGeom->toMultiLineString(); - auto poMLSBuilder = static_cast(poBuilder); - auto poLSBuilder = - static_cast(poMLSBuilder->value_builder()); - auto poPointBuilder = static_cast( - poLSBuilder->value_builder()); - auto poValueBuilder = static_cast( - poPointBuilder->value_builder()); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poMLSBuilder->Append()); - for (const auto *poLS : *poMLS) + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT: { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poLSBuilder->Append()); - for (int j = 0; j < poLS->getNumPoints(); ++j) + const auto poMultiPoint = poGeom->toMultiPoint(); + auto poListBuilder = static_cast(poBuilder); + auto poPointBuilder = static_cast( + poListBuilder->value_builder()); + GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder); + + OGR_ARROW_RETURN_OGRERR_NOT_OK(poListBuilder->Append()); + for (const auto *poPoint : *poMultiPoint) { OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getX(j))); + poXBuilder->Append(poPoint->getX())); OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getY(j))); - if (poGeom->Is3D()) + poYBuilder->Append(poPoint->getY())); + if (poZBuilder) OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getZ(j))); - if (poGeom->IsMeasured()) + poZBuilder->Append(poPoint->getZ())); + if (poMBuilder) OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poLS->getM(j))); + poMBuilder->Append(poPoint->getM())); } + break; } - } - else if (m_aeGeomEncoding[iGeomField] == - OGRArrowGeomEncoding::GEOARROW_MULTIPOLYGON) - { - const auto poMPoly = poGeom->toMultiPolygon(); - auto poMPolyBuilder = static_cast(poBuilder); - auto poPolyBuilder = - static_cast(poMPolyBuilder->value_builder()); - auto poRingBuilder = - static_cast(poPolyBuilder->value_builder()); - auto poPointBuilder = static_cast( - poRingBuilder->value_builder()); - auto poValueBuilder = static_cast( - poPointBuilder->value_builder()); - OGR_ARROW_RETURN_OGRERR_NOT_OK(poMPolyBuilder->Append()); - for (const auto *poPolygon : *poMPoly) + + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTILINESTRING: { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poPolyBuilder->Append()); - for (const auto *poRing : *poPolygon) + const auto poMLS = poGeom->toMultiLineString(); + auto poMLSBuilder = static_cast(poBuilder); + auto poLSBuilder = static_cast( + poMLSBuilder->value_builder()); + auto poPointBuilder = static_cast( + poLSBuilder->value_builder()); + auto poValueBuilder = static_cast( + poPointBuilder->value_builder()); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poMLSBuilder->Append()); + for (const auto *poLS : *poMLS) { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poRingBuilder->Append()); - for (int j = 0; j < poRing->getNumPoints(); ++j) + OGR_ARROW_RETURN_OGRERR_NOT_OK(poLSBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR(GeoArrowLineBuilder( + poLS, poPointBuilder, poValueBuilder, poValueBuilder, + bHasZ ? poValueBuilder : nullptr, + bHasM ? poValueBuilder : nullptr)); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING: + { + const auto poMLS = poGeom->toMultiLineString(); + auto poMLSBuilder = static_cast(poBuilder); + auto poLSBuilder = static_cast( + poMLSBuilder->value_builder()); + auto poPointBuilder = static_cast( + poLSBuilder->value_builder()); + GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder); + + OGR_ARROW_RETURN_OGRERR_NOT_OK(poMLSBuilder->Append()); + for (const auto *poLS : *poMLS) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poLSBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR( + GeoArrowLineBuilder(poLS, poPointBuilder, poXBuilder, + poYBuilder, poZBuilder, poMBuilder)); + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_FSL_MULTIPOLYGON: + { + const auto poMPoly = poGeom->toMultiPolygon(); + auto poMPolyBuilder = static_cast(poBuilder); + auto poPolyBuilder = static_cast( + poMPolyBuilder->value_builder()); + auto poRingBuilder = static_cast( + poPolyBuilder->value_builder()); + auto poPointBuilder = static_cast( + poRingBuilder->value_builder()); + auto poValueBuilder = static_cast( + poPointBuilder->value_builder()); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poMPolyBuilder->Append()); + for (const auto *poPolygon : *poMPoly) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPolyBuilder->Append()); + for (const auto *poRing : *poPolygon) { - OGR_ARROW_RETURN_OGRERR_NOT_OK(poPointBuilder->Append()); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getX(j))); - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getY(j))); - if (poGeom->Is3D()) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getZ(j))); - if (poGeom->IsMeasured()) - OGR_ARROW_RETURN_OGRERR_NOT_OK( - poValueBuilder->Append(poRing->getM(j))); + OGR_ARROW_RETURN_OGRERR_NOT_OK(poRingBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR(GeoArrowLineBuilder( + poRing, poPointBuilder, poValueBuilder, poValueBuilder, + bHasZ ? poValueBuilder : nullptr, + bHasM ? poValueBuilder : nullptr)); } } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON: + { + const auto poMPoly = poGeom->toMultiPolygon(); + auto poMPolyBuilder = static_cast(poBuilder); + auto poPolyBuilder = static_cast( + poMPolyBuilder->value_builder()); + auto poRingBuilder = static_cast( + poPolyBuilder->value_builder()); + auto poPointBuilder = static_cast( + poRingBuilder->value_builder()); + GET_XYZM_STRUCT_FIELD_BUILDERS_FROM(poPointBuilder); + + OGR_ARROW_RETURN_OGRERR_NOT_OK(poMPolyBuilder->Append()); + for (const auto *poPolygon : *poMPoly) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poPolyBuilder->Append()); + for (const auto *poRing : *poPolygon) + { + OGR_ARROW_RETURN_OGRERR_NOT_OK(poRingBuilder->Append()); + OGR_ARROW_PROPAGATE_OGRERR(GeoArrowLineBuilder( + poRing, poPointBuilder, poXBuilder, poYBuilder, + poZBuilder, poMBuilder)); + } + } + break; + } + + case OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC: + case OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC: + { + CPLAssert(false); + break; } - } - else - { - CPLAssert(false); } return OGRERR_NONE; diff --git a/ogr/ogrsf_frmts/parquet/ogr_parquet.h b/ogr/ogrsf_frmts/parquet/ogr_parquet.h index 38ad11baf06f..0b36c7b914c1 100644 --- a/ogr/ogrsf_frmts/parquet/ogr_parquet.h +++ b/ogr/ogrsf_frmts/parquet/ogr_parquet.h @@ -83,10 +83,16 @@ class OGRParquetLayer final : public OGRParquetLayerBase std::vector> m_apoArrowDataTypes{}; // .size() == field ocunt std::vector m_anMapFieldIndexToParquetColumn{}; - std::vector m_anMapGeomFieldIndexToParquetColumn{}; + std::vector> m_anMapGeomFieldIndexToParquetColumns{}; bool m_bHasMissingMappingToParquet = false; - std::vector m_anSelectedGroupsStartFeatureIdx{}; + //! Contains pairs of (selected feature idx, total feature idx) break points. + std::vector> m_asFeatureIdxRemapping{}; + //! Iterator over m_asFeatureIdxRemapping + std::vector>::iterator + m_oFeatureIdxRemappingIter{}; + //! Feature index among the potentially restricted set of selected row gropus + int64_t m_nFeatureIdxSelected = 0; std::vector m_anRequestedParquetColumns{}; // only valid when // m_bIgnoredFields is set #ifdef DEBUG @@ -140,6 +146,8 @@ class OGRParquetLayer final : public OGRParquetLayerBase bool FastGetExtent(int iGeomField, OGREnvelope *psExtent) const override; + void IncrFeatureIdx() override; + public: OGRParquetLayer(OGRParquetDataset *poDS, const char *pszLayerName, std::unique_ptr &&arrow_reader, diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp index 254bd1ececd2..5756ced588fd 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetdriver.cpp @@ -838,10 +838,14 @@ void OGRParquetDriver::InitMetadata() CPLAddXMLAttributeAndValue(psOption, "description", "Encoding of geometry columns"); CPLAddXMLAttributeAndValue(psOption, "default", "WKB"); - for (const char *pszEncoding : {"WKB", "WKT", "GEOARROW"}) + for (const char *pszEncoding : + {"WKB", "WKT", "GEOARROW", "GEOARROW_INTERLEAVED"}) { auto poValueNode = CPLCreateXMLNode(psOption, CXT_Element, "Value"); CPLCreateXMLNode(poValueNode, CXT_Text, pszEncoding); + if (EQUAL(pszEncoding, "GEOARROW")) + CPLAddXMLAttributeAndValue(poValueNode, "alias", + "GEOARROW_STRUCT"); } } diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp index 9aa49dfd7026..18838645cea3 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp @@ -496,6 +496,8 @@ OGRParquetLayer::OGRParquetLayer( EstablishFeatureDefn(); CPLAssert(static_cast(m_aeGeomEncoding.size()) == m_poFeatureDefn->GetGeomFieldCount()); + + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); } /************************************************************************/ @@ -656,10 +658,33 @@ void OGRParquetLayer::EstablishFeatureDefn() oMapParquetColumnNameToIdx); } - m_anMapGeomFieldIndexToParquetColumn.push_back( - bParquetColValid ? iParquetCol : -1); - if (bParquetColValid) - iParquetCol++; + if (bParquetColValid && + (field->type()->id() == arrow::Type::STRUCT || + field->type()->id() == arrow::Type::LIST)) + { + // GeoArrow types + std::vector anParquetCols; + for (const auto &iterParquetCols : oMapParquetColumnNameToIdx) + { + if (STARTS_WITH( + iterParquetCols.first.c_str(), + std::string(field->name()).append(".").c_str())) + { + iParquetCol = + std::max(iParquetCol, iterParquetCols.second); + anParquetCols.push_back(iterParquetCols.second); + } + } + m_anMapGeomFieldIndexToParquetColumns.push_back(anParquetCols); + ++iParquetCol; + } + else + { + m_anMapGeomFieldIndexToParquetColumns.push_back( + {bParquetColValid ? iParquetCol : -1}); + if (bParquetColValid) + iParquetCol++; + } } else { @@ -674,7 +699,7 @@ void OGRParquetLayer::EstablishFeatureDefn() m_poFeatureDefn->GetFieldCount()); CPLAssert(static_cast(m_anMapGeomFieldIndexToArrowColumn.size()) == m_poFeatureDefn->GetGeomFieldCount()); - CPLAssert(static_cast(m_anMapGeomFieldIndexToParquetColumn.size()) == + CPLAssert(static_cast(m_anMapGeomFieldIndexToParquetColumns.size()) == m_poFeatureDefn->GetGeomFieldCount()); if (!fields.empty()) @@ -1173,6 +1198,13 @@ void OGRParquetLayer::ResetReading() m_poRecordBatchReader.reset(); } OGRParquetLayerBase::ResetReading(); + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); + m_nFeatureIdxSelected = 0; + if (!m_asFeatureIdxRemapping.empty()) + { + m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; + ++m_oFeatureIdxRemappingIter; + } } /************************************************************************/ @@ -1279,6 +1311,25 @@ static IsConstraintPossibleRes IsConstraintPossible(int nOperation, T v, T min, return IsConstraintPossibleRes::YES; } +/************************************************************************/ +/* IncrFeatureIdx() */ +/************************************************************************/ + +void OGRParquetLayer::IncrFeatureIdx() +{ + ++m_nFeatureIdxSelected; + ++m_nFeatureIdx; + if (m_iFIDArrowColumn < 0 && !m_asFeatureIdxRemapping.empty() && + m_oFeatureIdxRemappingIter != m_asFeatureIdxRemapping.end()) + { + if (m_nFeatureIdxSelected == m_oFeatureIdxRemappingIter->first) + { + m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; + ++m_oFeatureIdxRemappingIter; + } + } +} + /************************************************************************/ /* ReadNextBatch() */ /************************************************************************/ @@ -1299,7 +1350,7 @@ bool OGRParquetLayer::ReadNextBatch() if (m_poRecordBatchReader == nullptr) { - m_anSelectedGroupsStartFeatureIdx.clear(); + m_asFeatureIdxRemapping.clear(); bool bIterateEverything = false; std::vector anSelectedGroups; @@ -1311,8 +1362,29 @@ bool OGRParquetLayer::ReadNextBatch() m_oMapGeomFieldIndexToGeomColBBOXParquet.end() && CPLTestBool(CPLGetConfigOption( ("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), "YES"))); - - if (m_asAttributeFilterConstraints.empty() && !bUSEBBOXFields) + const bool bIsGeoArrowStruct = + (m_iGeomFieldFilter >= 0 && + m_iGeomFieldFilter < static_cast(m_aeGeomEncoding.size()) && + m_iGeomFieldFilter < + static_cast( + m_anMapGeomFieldIndexToParquetColumns.size()) && + m_anMapGeomFieldIndexToParquetColumns[m_iGeomFieldFilter].size() >= + 2 && + (m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON)); + + if (m_asAttributeFilterConstraints.empty() && !bUSEBBOXFields && + !(bIsGeoArrowStruct && m_poFilterGeom)) { bIterateEverything = true; } @@ -1328,7 +1400,49 @@ bool OGRParquetLayer::ReadNextBatch() OGRFieldType eType = OFTMaxType; OGRFieldSubType eSubType = OFSTNone; std::string osMinTmp, osMaxTmp; - GIntBig nFeatureIdx = 0; + int64_t nFeatureIdxSelected = 0; + int64_t nFeatureIdxTotal = 0; + + int iXMinField = -1; + int iYMinField = -1; + int iXMaxField = -1; + int iYMaxField = -1; + + if (bIsGeoArrowStruct) + { + const auto metadata = + m_poArrowReader->parquet_reader()->metadata(); + const auto poParquetSchema = metadata->schema(); + for (int iParquetCol : + m_anMapGeomFieldIndexToParquetColumns[m_iGeomFieldFilter]) + { + const auto parquetColumn = + poParquetSchema->Column(iParquetCol); + const auto parquetColumnName = + parquetColumn->path()->ToDotString(); + if (parquetColumnName.size() > 2 && + parquetColumnName.find(".x") == + parquetColumnName.size() - 2) + { + iXMinField = iParquetCol; + iXMaxField = iParquetCol; + } + else if (parquetColumnName.size() > 2 && + parquetColumnName.find(".y") == + parquetColumnName.size() - 2) + { + iYMinField = iParquetCol; + iYMaxField = iParquetCol; + } + } + } + else if (bUSEBBOXFields) + { + iXMinField = oIterToGeomColBBOX->second.iParquetXMin; + iYMinField = oIterToGeomColBBOX->second.iParquetYMin; + iXMaxField = oIterToGeomColBBOX->second.iParquetXMax; + iYMaxField = oIterToGeomColBBOX->second.iParquetYMax; + } for (int iRowGroup = 0; iRowGroup < nNumGroups && !bIterateEverything; ++iRowGroup) @@ -1337,12 +1451,13 @@ bool OGRParquetLayer::ReadNextBatch() auto poRowGroup = GetReader()->parquet_reader()->RowGroup(iRowGroup); - if (bUSEBBOXFields) + if (iXMinField >= 0 && iYMinField >= 0 && iXMaxField >= 0 && + iYMaxField >= 0) { - if (GetMinMaxForParquetCol( - iRowGroup, oIterToGeomColBBOX->second.iParquetXMin, - nullptr, true, sMin, bFoundMin, false, sMax, - bFoundMax, eType, eSubType, osMinTmp, osMaxTmp) && + if (GetMinMaxForParquetCol(iRowGroup, iXMinField, nullptr, + true, sMin, bFoundMin, false, + sMax, bFoundMax, eType, eSubType, + osMinTmp, osMaxTmp) && bFoundMin && eType == OFTReal) { const double dfGroupMinX = sMin.Real; @@ -1351,11 +1466,9 @@ bool OGRParquetLayer::ReadNextBatch() bSelectGroup = false; } else if (GetMinMaxForParquetCol( - iRowGroup, - oIterToGeomColBBOX->second.iParquetYMin, - nullptr, true, sMin, bFoundMin, false, - sMax, bFoundMax, eType, eSubType, osMinTmp, - osMaxTmp) && + iRowGroup, iYMinField, nullptr, true, sMin, + bFoundMin, false, sMax, bFoundMax, eType, + eSubType, osMinTmp, osMaxTmp) && bFoundMin && eType == OFTReal) { const double dfGroupMinY = sMin.Real; @@ -1364,12 +1477,9 @@ bool OGRParquetLayer::ReadNextBatch() bSelectGroup = false; } else if (GetMinMaxForParquetCol( - iRowGroup, - oIterToGeomColBBOX->second - .iParquetXMax, - nullptr, false, sMin, bFoundMin, true, - sMax, bFoundMax, eType, eSubType, - osMinTmp, osMaxTmp) && + iRowGroup, iXMaxField, nullptr, false, + sMin, bFoundMin, true, sMax, bFoundMax, + eType, eSubType, osMinTmp, osMaxTmp) && bFoundMax && eType == OFTReal) { const double dfGroupMaxX = sMax.Real; @@ -1378,12 +1488,10 @@ bool OGRParquetLayer::ReadNextBatch() bSelectGroup = false; } else if (GetMinMaxForParquetCol( - iRowGroup, - oIterToGeomColBBOX->second - .iParquetYMax, - nullptr, false, sMin, bFoundMin, - true, sMax, bFoundMax, eType, - eSubType, osMinTmp, osMaxTmp) && + iRowGroup, iYMaxField, nullptr, + false, sMin, bFoundMin, true, sMax, + bFoundMax, eType, eSubType, + osMinTmp, osMaxTmp) && bFoundMax && eType == OFTReal) { const double dfGroupMaxY = sMax.Real; @@ -1413,9 +1521,9 @@ bool OGRParquetLayer::ReadNextBatch() if (iOGRField == OGR_FID_INDEX && m_iFIDParquetColumn < 0) { - sMin.Integer64 = nFeatureIdx; + sMin.Integer64 = nFeatureIdxTotal; sMax.Integer64 = - nFeatureIdx + + nFeatureIdxTotal + poRowGroup->metadata()->num_rows() - 1; eType = OFTInteger64; } @@ -1572,26 +1680,35 @@ bool OGRParquetLayer::ReadNextBatch() if (bSelectGroup) { // CPLDebug("PARQUET", "Selecting row group %d", iRowGroup); - m_anSelectedGroupsStartFeatureIdx.push_back(nFeatureIdx); + m_asFeatureIdxRemapping.emplace_back( + std::make_pair(nFeatureIdxSelected, nFeatureIdxTotal)); anSelectedGroups.push_back(iRowGroup); + nFeatureIdxSelected += poRowGroup->metadata()->num_rows(); } - nFeatureIdx += poRowGroup->metadata()->num_rows(); + nFeatureIdxTotal += poRowGroup->metadata()->num_rows(); } } if (bIterateEverything) { - m_anSelectedGroupsStartFeatureIdx.clear(); + m_asFeatureIdxRemapping.clear(); + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); if (!CreateRecordBatchReader(0)) return false; } else { + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); if (anSelectedGroups.empty()) { return false; } + CPLDebug("PARQUET", "%d/%d row groups selected", + int(anSelectedGroups.size()), + m_poArrowReader->num_row_groups()); + m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; + ++m_oFeatureIdxRemappingIter; if (!CreateRecordBatchReader(anSelectedGroups)) { return false; @@ -1624,13 +1741,6 @@ bool OGRParquetLayer::ReadNextBatch() m_poBatch.reset(); return false; } - if (!m_anSelectedGroupsStartFeatureIdx.empty()) - { - CPLAssert( - m_iRecordBatch < - static_cast(m_anSelectedGroupsStartFeatureIdx.size())); - m_nFeatureIdx = m_anSelectedGroupsStartFeatureIdx[m_iRecordBatch]; - } } while (poNextBatch->num_rows() == 0); SetBatch(poNextBatch); @@ -1814,12 +1924,14 @@ OGRErr OGRParquetLayer::SetIgnoredFields(const char **papszFields) { if (!m_poFeatureDefn->GetGeomFieldDefn(i)->IsIgnored()) { - const int iParquetCol = - m_anMapGeomFieldIndexToParquetColumn[i]; - CPLAssert(iParquetCol >= 0); + const auto &anVals = + m_anMapGeomFieldIndexToParquetColumns[i]; + CPLAssert(!anVals.empty() && anVals[0] >= 0); + m_anRequestedParquetColumns.insert( + m_anRequestedParquetColumns.end(), anVals.begin(), + anVals.end()); m_anMapGeomFieldIndexToArrayIndex.push_back(nBatchColumns); nBatchColumns++; - m_anRequestedParquetColumns.push_back(iParquetCol); auto oIter = m_oMapGeomFieldIndexToGeomColBBOX.find(i); const auto oIterParquet = @@ -1828,11 +1940,32 @@ OGRErr OGRParquetLayer::SetIgnoredFields(const char **papszFields) oIterParquet != m_oMapGeomFieldIndexToGeomColBBOXParquet.end()) { - oIter->second.iArrayIdx = nBatchColumns++; - m_anRequestedParquetColumns.insert( - m_anRequestedParquetColumns.end(), - oIterParquet->second.anParquetCols.begin(), - oIterParquet->second.anParquetCols.end()); + const bool bIsGeoArrowStruct = + (m_aeGeomEncoding[i] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_LINESTRING || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_POLYGON || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_MULTIPOINT || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_MULTILINESTRING || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_MULTIPOLYGON); + if (!bIsGeoArrowStruct) + { + oIter->second.iArrayIdx = nBatchColumns++; + m_anRequestedParquetColumns.insert( + m_anRequestedParquetColumns.end(), + oIterParquet->second.anParquetCols.begin(), + oIterParquet->second.anParquetCols.end()); + } } } else diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp index b84be7a04fd4..eac58831f317 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetwriterlayer.cpp @@ -366,8 +366,24 @@ bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions, m_eGeomEncoding = OGRArrowGeomEncoding::WKB; else if (EQUAL(pszGeomEncoding, "WKT")) m_eGeomEncoding = OGRArrowGeomEncoding::WKT; - else if (EQUAL(pszGeomEncoding, "GEOARROW")) - m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_GENERIC; + else if (EQUAL(pszGeomEncoding, "GEOARROW_INTERLEAVED")) + { + static bool bHasWarned = false; + if (!bHasWarned) + { + bHasWarned = true; + CPLError( + CE_Warning, CPLE_AppDefined, + "Use of GEOMETRY_ENCODING=GEOARROW_INTERLEAVED is not " + "recommended. " + "GeoParquet 1.1 uses GEOMETRY_ENCODING=GEOARROW (struct) " + "instead."); + } + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC; + } + else if (EQUAL(pszGeomEncoding, "GEOARROW") || + EQUAL(pszGeomEncoding, "GEOARROW_STRUCT")) + m_eGeomEncoding = OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC; else { CPLError(CE_Failure, CPLE_NotSupported, @@ -395,10 +411,12 @@ bool OGRParquetWriterLayer::SetOptions(CSLConstList papszOptions, m_poFeatureDefn->SetGeomType(eGType); auto eGeomEncoding = m_eGeomEncoding; - if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC) + if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_FSL_GENERIC || + eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC) { - eGeomEncoding = GetPreciseArrowGeomEncoding(eGType); - if (eGeomEncoding == OGRArrowGeomEncoding::GEOARROW_GENERIC) + const auto eEncodingType = eGeomEncoding; + eGeomEncoding = GetPreciseArrowGeomEncoding(eEncodingType, eGType); + if (eGeomEncoding == eEncodingType) return false; } m_aeGeomEncoding.push_back(eGeomEncoding); @@ -634,7 +652,11 @@ std::string OGRParquetWriterLayer::GetGeoMetadata() const CPLTestBool(CPLGetConfigOption("OGR_PARQUET_WRITE_GEO", "YES"))) { CPLJSONObject oRoot; - oRoot.Add("version", "1.0.0"); + oRoot.Add("version", + m_eGeomEncoding == + OGRArrowGeomEncoding::GEOARROW_STRUCT_GENERIC + ? "1.1.0" + : "1.0.0"); oRoot.Add("primary_column", m_poFeatureDefn->GetGeomFieldDefn(0)->GetNameRef()); CPLJSONObject oColumns; diff --git a/swig/python/gdal-utils/osgeo_utils/samples/validate_geoparquet.py b/swig/python/gdal-utils/osgeo_utils/samples/validate_geoparquet.py index 6d4aa2f1b0e5..f03e6df16df5 100644 --- a/swig/python/gdal-utils/osgeo_utils/samples/validate_geoparquet.py +++ b/swig/python/gdal-utils/osgeo_utils/samples/validate_geoparquet.py @@ -181,8 +181,7 @@ def _check(self): if not self.local_schema: return - if self.local_schema: - schema_j = json.loads(open(self.local_schema, "rb").read()) + version = None else: version = geo_j["version"] if not isinstance(version, str): @@ -190,7 +189,15 @@ def _check(self): "'geo[\"version\"]' is not a string. Value of 'geo' = '%s'" % geo ) - schema_url = f"https://github.com/opengeospatial/geoparquet/releases/download/v{version}/schema.json" + if self.local_schema: + schema_j = json.loads(open(self.local_schema, "rb").read()) + else: + # FIXME: Remove that temporary hack once GeoParquet 1.1 schema is released + if version == "1.1.0": + schema_url = "https://github.com/opengeospatial/geoparquet/releases/download/v1.0.0/schema.json" + else: + schema_url = f"https://github.com/opengeospatial/geoparquet/releases/download/v{version}/schema.json" + if schema_url not in geoparquet_schemas: import urllib @@ -210,6 +217,16 @@ def _check(self): schema_j = geoparquet_schemas[schema_url] + # FIXME: Remove that temporary hack once GeoParquet 1.1 schema is released + if version == "1.1.0": + schema_j["properties"]["version"] = {"const": "1.1.0", "type": "string"} + schema_j["properties"]["columns"]["patternProperties"][".+"]["properties"][ + "encoding" + ] = { + "type": "string", + "pattern": "^(WKB|point|linestring|polygon|multipoint|multilinestring|multipolygon)$", + } + try: self._validate(schema_j, geo_j) except Exception as e: @@ -313,7 +330,100 @@ def _check_column_metadata(self, column_name, column_def): f"{geometry_type} is declared several times in geometry_types[]" ) + def _check_data_with_high_level_ogr_api(self, lyr, columns): + """Use for GeoArrow validation""" + + if gdal.VersionInfo() < "3090000": + raise Exception("GDAL 3.9 or later required for GeoArrow data validation") + + lyr.SetIgnoredFields( + [ + lyr.GetLayerDefn().GetFieldDefn(i).GetName() + for i in range(lyr.GetLayerDefn().GetFieldCount()) + ] + ) + + list_of_set_geometry_types = [] + orientations = [] + encodings = [] + assert len(columns) == lyr.GetLayerDefn().GetGeomFieldCount() + for i in range(lyr.GetLayerDefn().GetGeomFieldCount()): + column_def = columns[lyr.GetLayerDefn().GetGeomFieldDefn(i).GetName()] + + encodings.append(column_def["encoding"]) + + if "geometry_types" in column_def: + geometry_types = column_def["geometry_types"] + set_geometry_types = set(geometry_types) + else: + set_geometry_types = set() + list_of_set_geometry_types.append(set_geometry_types) + + if "orientation" in column_def: + orientation = column_def["orientation"] + else: + orientation = None + orientations.append(orientation) + + map_ogr_geom_type_to_geoarrow_encoding = { + ogr.wkbPoint: "point", + ogr.wkbLineString: "linestring", + ogr.wkbPolygon: "polygon", + ogr.wkbMultiPoint: "multipoint", + ogr.wkbMultiLineString: "multilinestring", + ogr.wkbMultiPolygon: "multipolygon", + } + + for row, f in enumerate(lyr): + for i in range(lyr.GetLayerDefn().GetGeomFieldCount()): + g = f.GetGeomFieldRef(i) + if g: + ogr_geom_type = g.GetGeometryType() + if ogr_geom_type not in map_ogr_geom_type_to_geoparquet: + self._error( + f"Geometry at row {row} is of unexpected type for GeoParquet: %s" + % g.GetGeometryName() + ) + elif list_of_set_geometry_types[i]: + geoparquet_geom_type = map_ogr_geom_type_to_geoparquet[ + ogr_geom_type + ] + if geoparquet_geom_type not in list_of_set_geometry_types[i]: + self._error( + f"Geometry at row {row} is of type {geoparquet_geom_type}, but not listed in geometry_types[]" + ) + + ogr_flat_geom_type = ogr.GT_Flatten(ogr_geom_type) + if ogr_flat_geom_type not in map_ogr_geom_type_to_geoarrow_encoding: + self._error( + f"Geometry at row {row} is of unexpected type for a GeoArrow encoding of GeoParquet: %s" + % g.GetGeometryName() + ) + elif ( + map_ogr_geom_type_to_geoarrow_encoding[ogr_flat_geom_type] + != encodings[i] + ): + self._error( + f"Geometry at row {row} is a %s but does not match the declared encoding of %s" + % (ogr.GeometryTypeToName(ogr_geom_type), encodings[i]) + ) + + if orientations[i] == "counterclockwise": + self._check_counterclockwise(g, row) + def _check_data(self, lyr, columns): + + # For non-WKB encoding, just use the high level OGR API + use_high_level_ogr_api = False + for _, column_def in columns.items(): + if column_def["encoding"] != "WKB": + use_high_level_ogr_api = True + break + + if use_high_level_ogr_api: + self._check_data_with_high_level_ogr_api(lyr, columns) + return + lyr.SetIgnoredFields( [ lyr.GetLayerDefn().GetFieldDefn(i).GetName()