From 5982e637b5991bf33f0cbdfcbf0a305dabcacf0b Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 30 Mar 2024 23:53:59 +0100 Subject: [PATCH] Arrow/Parquet: optimize spatial filtering on GeoArrow struct encoding when there is no bbox column --- autotest/ogr/ogr_parquet.py | 38 +- doc/source/drivers/vector/parquet.rst | 5 +- .../arrow_common/ograrrowlayer.hpp | 700 +++++++++++++++--- ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp | 138 +++- 4 files changed, 739 insertions(+), 142 deletions(-) diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index 774a78b567b7..676e31b38f9b 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -3511,8 +3511,11 @@ def check_file(filename): ], ) @pytest.mark.parametrize("check_with_pyarrow", [True, False]) +@pytest.mark.parametrize("covering_bbox", [True, False]) @gdaltest.enable_exceptions() -def test_ogr_parquet_geoarrow(tmp_vsimem, tmp_path, wkt, check_with_pyarrow): +def test_ogr_parquet_geoarrow( + tmp_vsimem, tmp_path, wkt, check_with_pyarrow, covering_bbox +): geom = ogr.CreateGeometryFromWkt(wkt) @@ -3525,7 +3528,12 @@ def test_ogr_parquet_geoarrow(tmp_vsimem, tmp_path, wkt, check_with_pyarrow): ds = ogr.GetDriverByName("Parquet").CreateDataSource(filename) lyr = ds.CreateLayer( - "test", geom_type=geom.GetGeometryType(), options=["GEOMETRY_ENCODING=GEOARROW"] + "test", + geom_type=geom.GetGeometryType(), + options=[ + "GEOMETRY_ENCODING=GEOARROW", + "WRITE_COVERING_BBOX=" + ("YES" if covering_bbox else "NO"), + ], ) lyr.CreateField(ogr.FieldDefn("foo")) @@ -3598,3 +3606,29 @@ def check(lyr): lyr = ds.GetLayer(0) lyr.SetIgnoredFields(["foo"]) check(lyr) + + ds = ogr.Open(filename) + lyr = ds.GetLayer(0) + minx, maxx, miny, maxy = geom.GetEnvelope() + + lyr.SetSpatialFilter(geom) + assert lyr.GetFeatureCount() == (3 if geom.GetGeometryCount() > 1 else 2) + + lyr.SetSpatialFilterRect(maxx + 1, miny, maxx + 2, maxy) + assert lyr.GetFeatureCount() == 0 + + lyr.SetSpatialFilterRect(minx, maxy + 1, maxx, maxy + 2) + assert lyr.GetFeatureCount() == 0 + + lyr.SetSpatialFilterRect(minx - 2, miny, minx - 1, maxy) + assert lyr.GetFeatureCount() == 0 + + lyr.SetSpatialFilterRect(minx, miny - 2, maxx, miny - 1) + assert lyr.GetFeatureCount() == 0 + if ( + minx != miny + and maxx != maxy + and ogr.GT_Flatten(geom.GetGeometryType()) != ogr.wkbMultiPoint + ): + lyr.SetSpatialFilterRect(minx + 0.1, miny + 0.1, maxx - 0.1, maxy - 0.1) + assert lyr.GetFeatureCount() != 0 diff --git a/doc/source/drivers/vector/parquet.rst b/doc/source/drivers/vector/parquet.rst index e52535cb434f..965b27dd0fe0 100644 --- a/doc/source/drivers/vector/parquet.rst +++ b/doc/source/drivers/vector/parquet.rst @@ -128,7 +128,10 @@ Layer creation options :since: 3.9 Whether to write xmin/ymin/xmax/ymax columns with the bounding box of - geometries. + geometries. Writing the geometry bounding box may help applications to + perform faster spatial filtering. Writing a geometry bounding box is less + necessary for the GeoArrow geometry encoding than for the default WKB, as + implementations may be able to directly use the geometry columns. - .. lco:: SORT_BY_BBOX :choices: YES, NO diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index 79844ef64a81..bc34bc90b047 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -3938,11 +3938,11 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() iCol = m_anMapGeomFieldIndexToArrowColumn[m_iGeomFieldFilter]; } - OGREnvelope sEnvelopeSkipToNextFeatureDueToBBOX; - const auto SkipToNextFeatureDueToBBOX = - [this, &sEnvelopeSkipToNextFeatureDueToBBOX]() + if (m_poArrayXMinFloat || m_poArrayXMinDouble) { - if (!m_poArrayBBOX || !m_poArrayBBOX->IsNull(m_nIdxInBatch)) + OGREnvelope sEnvelopeSkipToNextFeatureDueToBBOX; + const auto IntersectsBBOX = + [this, &sEnvelopeSkipToNextFeatureDueToBBOX]() { if (m_poArrayXMinFloat && !m_poArrayXMinFloat->IsNull(m_nIdxInBatch)) @@ -3955,7 +3955,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() m_poArrayXMaxFloat->Value(m_nIdxInBatch); sEnvelopeSkipToNextFeatureDueToBBOX.MaxY = m_poArrayYMaxFloat->Value(m_nIdxInBatch); - if (!m_sFilterEnvelope.Intersects( + if (m_sFilterEnvelope.Intersects( sEnvelopeSkipToNextFeatureDueToBBOX)) { return true; @@ -3972,46 +3972,60 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() m_poArrayXMaxDouble->Value(m_nIdxInBatch); sEnvelopeSkipToNextFeatureDueToBBOX.MaxY = m_poArrayYMaxDouble->Value(m_nIdxInBatch); - if (!m_sFilterEnvelope.Intersects( + if (m_sFilterEnvelope.Intersects( sEnvelopeSkipToNextFeatureDueToBBOX)) { return true; } } - } - return false; - }; + return false; + }; - if (iCol >= 0 && - m_aeGeomEncoding[m_iGeomFieldFilter] == OGRArrowGeomEncoding::WKB) + while (true) + { + if (!m_poArrayBBOX->IsNull(m_nIdxInBatch) && IntersectsBBOX() && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + } + } + } + else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::WKB) { CPLAssert(m_poArrayWKB || m_poArrayWKBLarge); OGREnvelope sEnvelope; while (true) { - bool bSkipToNextFeature = false; + bool bMatchBBOX = false; if ((m_poArrayWKB && m_poArrayWKB->IsNull(m_nIdxInBatch)) || (m_poArrayWKBLarge && m_poArrayWKBLarge->IsNull(m_nIdxInBatch))) { - bSkipToNextFeature = true; + // nothing to do } else { - if (m_poArrayXMinFloat || m_poArrayXMinDouble) - { - bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); - } - else if (m_poArrayWKB) + if (m_poArrayWKB) { int out_length = 0; const uint8_t *data = m_poArrayWKB->GetValue(m_nIdxInBatch, &out_length); if (OGRWKBGetBoundingBox(data, out_length, sEnvelope) && - !m_sFilterEnvelope.Intersects(sEnvelope)) + m_sFilterEnvelope.Intersects(sEnvelope)) { - bSkipToNextFeature = true; + bMatchBBOX = true; } } else @@ -4024,18 +4038,14 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() OGRWKBGetBoundingBox(data, static_cast(out_length64), sEnvelope) && - !m_sFilterEnvelope.Intersects(sEnvelope)) + m_sFilterEnvelope.Intersects(sEnvelope)) { - bSkipToNextFeature = true; + bMatchBBOX = true; } } } - if (!bSkipToNextFeature) - { - break; - } - if (!m_asAttributeFilterConstraints.empty() && - !SkipToNextFeatureDueToAttributeFilter()) + if (bMatchBBOX && (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) { break; } @@ -4061,128 +4071,598 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() const bool bHasM = CPL_TO_BOOL(OGR_GT_HasM(eGeomType)); const int nDim = 2 + (bHasZ ? 1 : 0) + (bHasM ? 1 : 0); - begin_multipolygon: - auto array = m_poBatchColumns[iCol].get(); - CPLAssert(array->type_id() == arrow::Type::LIST); - auto listOfPartsArray = - static_cast(array); - CPLAssert(listOfPartsArray->values()->type_id() == - arrow::Type::LIST); - auto listOfPartsValues = std::static_pointer_cast( - listOfPartsArray->values()); - CPLAssert(listOfPartsValues->values()->type_id() == - arrow::Type::LIST); - auto listOfRingsValues = std::static_pointer_cast( - listOfPartsValues->values()); - CPLAssert(listOfRingsValues->values()->type_id() == - arrow::Type::FIXED_SIZE_LIST); - auto listOfPointsValues = - std::static_pointer_cast( - listOfRingsValues->values()); - CPLAssert(listOfPointsValues->values()->type_id() == - arrow::Type::DOUBLE); - auto pointValues = std::static_pointer_cast( - listOfPointsValues->values()); + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + auto listOfPartsValues = + std::static_pointer_cast( + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::LIST); + auto listOfRingsValues = + std::static_pointer_cast( + listOfPartsValues->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::FIXED_SIZE_LIST); + auto listOfPointsValues = + std::static_pointer_cast( + listOfRingsValues->values()); + CPLAssert(listOfPointsValues->values()->type_id() == + arrow::Type::DOUBLE); + auto pointValues = std::static_pointer_cast( + listOfPointsValues->values()); + + while (true) + { + bool bMatchBBOX = false; + if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + { + OGREnvelope sEnvelope; + const auto nParts = + listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(m_nIdxInBatch); + for (auto j = decltype(nParts){0}; j < nParts; j++) + { + const auto nRings = listOfPartsValues->value_length( + nPartOffset + j); + const auto nRingOffset = + listOfPartsValues->value_offset(nPartOffset + + j); + if (nRings >= 1) + { + const auto nPoints = + listOfRingsValues->value_length( + nRingOffset); + const auto nPointOffset = + listOfRingsValues->value_offset( + nRingOffset) * + nDim; + const double *padfRawValue = + pointValues->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawValue[nDim * l], + padfRawValue[nDim * l + 1]); + } + // for bounding box, only the first ring matters + } + } - while (true) + if (nParts != 0 && + m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } + } + } while (!bReturnFeature); + } + else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::STRUCT); + auto pointValues = + static_cast(array); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) + { + bool bMatchBBOX = false; + if (!array->IsNull(m_nIdxInBatch)) + { + const double dfX = fieldXDouble->Value(m_nIdxInBatch); + const double dfY = fieldYDouble->Value(m_nIdxInBatch); + if (dfX >= m_sFilterEnvelope.MinX && + dfY >= m_sFilterEnvelope.MinY && + dfX <= m_sFilterEnvelope.MaxX && + dfY <= m_sFilterEnvelope.MaxY) + { + bMatchBBOX = true; + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } + } + } while (!bReturnFeature); + } + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING) + { + bool bReturnFeature; + do { - bool bSkipToNextFeature = false; - if (m_poArrayXMinFloat || m_poArrayXMinDouble) + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listArray = + static_cast(array); + CPLAssert(listArray->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listArray->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); + bool bMatchBBOX = false; + if (!listArray->IsNull(m_nIdxInBatch)) + { + OGREnvelope sEnvelope; + const auto nPoints = + listArray->value_length(m_nIdxInBatch); + const auto nPointOffset = + listArray->value_offset(m_nIdxInBatch); + if (nPoints > 0) + { + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); + } + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } } - else if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + } while (!bReturnFeature); + } + else if (iCol >= 0 && m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listOfRingsArray = + static_cast(array); + CPLAssert(listOfRingsArray->values()->type_id() == + arrow::Type::LIST); + const auto listOfRingsValues = + std::static_pointer_cast( + listOfRingsArray->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listOfRingsValues->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - OGREnvelope sEnvelope; - const auto nParts = - listOfPartsArray->value_length(m_nIdxInBatch); - const auto nPartOffset = - listOfPartsArray->value_offset(m_nIdxInBatch); - for (auto j = decltype(nParts){0}; j < nParts; j++) + bool bMatchBBOX = false; + if (!listOfRingsArray->IsNull(m_nIdxInBatch)) { + OGREnvelope sEnvelope; const auto nRings = - listOfPartsValues->value_length(nPartOffset + j); + listOfRingsArray->value_length(m_nIdxInBatch); const auto nRingOffset = - listOfPartsValues->value_offset(nPartOffset + j); + listOfRingsArray->value_offset(m_nIdxInBatch); if (nRings >= 1) { const auto nPoints = listOfRingsValues->value_length(nRingOffset); const auto nPointOffset = - listOfRingsValues->value_offset(nRingOffset) * - nDim; - const double *padfRawValue = - pointValues->raw_values() + nPointOffset; + listOfRingsValues->value_offset(nRingOffset); + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; for (auto l = decltype(nPoints){0}; l < nPoints; ++l) { - sEnvelope.Merge(padfRawValue[nDim * l], - padfRawValue[nDim * l + 1]); + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); } // for bounding box, only the first ring matters + + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } } } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } - if (nParts != 0 && !m_sFilterEnvelope.Intersects(sEnvelope)) + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) { - bSkipToNextFeature = true; + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; } } - if (!bSkipToNextFeature) - { - break; - } - if (!m_asAttributeFilterConstraints.empty() && - !SkipToNextFeatureDueToAttributeFilter()) + } while (!bReturnFeature); + } + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + const auto listArray = + static_cast(array); + CPLAssert(listArray->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listArray->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - break; - } + bool bMatchBBOX = false; + if (!listArray->IsNull(m_nIdxInBatch)) + { + const auto nPoints = + listArray->value_length(m_nIdxInBatch); + const auto nPointOffset = + listArray->value_offset(m_nIdxInBatch); + if (nPoints > 0) + { + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + if (padfRawXValue[l] >= + m_sFilterEnvelope.MinX && + padfRawYValue[l] >= + m_sFilterEnvelope.MinY && + padfRawXValue[l] <= + m_sFilterEnvelope.MaxX && + padfRawYValue[l] <= m_sFilterEnvelope.MaxY) + { + bMatchBBOX = true; + break; + } + } + } + } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } - IncrFeatureIdx(); - m_nIdxInBatch++; - if (m_nIdxInBatch == m_poBatch->num_rows()) - { - m_bEOF = !ReadNextBatch(); - if (m_bEOF) - return nullptr; - goto begin_multipolygon; + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } } - } + } while (!bReturnFeature); } - else if (iCol >= 0) + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING) { - auto array = m_poBatchColumns[iCol].get(); - while (true) + bool bReturnFeature; + do { - bool bSkipToNextFeature = false; - if (m_poArrayXMinFloat || m_poArrayXMinDouble) - { - bSkipToNextFeature = SkipToNextFeatureDueToBBOX(); - } - else + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + auto listOfPartsValues = + std::static_pointer_cast( + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listOfPartsValues->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) { - auto poGeometry = std::unique_ptr( - ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); - if (poGeometry == nullptr || poGeometry->IsEmpty()) + bool bMatchBBOX = false; + if (!listOfPartsArray->IsNull(m_nIdxInBatch)) { - bSkipToNextFeature = true; + const auto nParts = + listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(m_nIdxInBatch); + for (auto j = decltype(nParts){0}; + j < nParts && !bMatchBBOX; j++) + { + OGREnvelope sEnvelope; + const auto nPoints = + listOfPartsValues->value_length(nPartOffset + + j); + const auto nPointOffset = + listOfPartsValues->value_offset(nPartOffset + + j); + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); + } + + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + } } - else + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) { - OGREnvelope sEnvelope; - poGeometry->getEnvelope(&sEnvelope); - if (!m_sFilterEnvelope.Intersects(sEnvelope)) + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } + } + } while (!bReturnFeature); + } + else if (iCol >= 0 && + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON) + { + bool bReturnFeature; + do + { + bReturnFeature = false; + auto array = m_poBatchColumns[iCol].get(); + CPLAssert(array->type_id() == arrow::Type::LIST); + auto listOfPartsArray = + static_cast(array); + CPLAssert(listOfPartsArray->values()->type_id() == + arrow::Type::LIST); + auto listOfPartsValues = + std::static_pointer_cast( + listOfPartsArray->values()); + CPLAssert(listOfPartsValues->values()->type_id() == + arrow::Type::LIST); + auto listOfRingsValues = + std::static_pointer_cast( + listOfPartsValues->values()); + CPLAssert(listOfRingsValues->values()->type_id() == + arrow::Type::STRUCT); + auto pointValues = std::static_pointer_cast( + listOfRingsValues->values()); + const auto &fields = pointValues->fields(); + const auto &fieldX = fields[0]; + CPLAssert(fieldX->type_id() == arrow::Type::DOUBLE); + const auto fieldXDouble = + static_cast(fieldX.get()); + const auto &fieldY = fields[1]; + CPLAssert(fieldY->type_id() == arrow::Type::DOUBLE); + const auto fieldYDouble = + static_cast(fieldY.get()); + + while (true) + { + bool bMatchBBOX = false; + if (!listOfPartsArray->IsNull(m_nIdxInBatch)) + { + const auto nParts = + listOfPartsArray->value_length(m_nIdxInBatch); + const auto nPartOffset = + listOfPartsArray->value_offset(m_nIdxInBatch); + for (auto j = decltype(nParts){0}; + j < nParts && !bMatchBBOX; j++) { - bSkipToNextFeature = true; + OGREnvelope sEnvelope; + const auto nRings = listOfPartsValues->value_length( + nPartOffset + j); + const auto nRingOffset = + listOfPartsValues->value_offset(nPartOffset + + j); + if (nRings >= 1) + { + const auto nPoints = + listOfRingsValues->value_length( + nRingOffset); + const auto nPointOffset = + listOfRingsValues->value_offset( + nRingOffset); + const double *padfRawXValue = + fieldXDouble->raw_values() + nPointOffset; + const double *padfRawYValue = + fieldYDouble->raw_values() + nPointOffset; + for (auto l = decltype(nPoints){0}; l < nPoints; + ++l) + { + sEnvelope.Merge(padfRawXValue[l], + padfRawYValue[l]); + } + + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } + // for bounding box, only the first ring matters + } } } + if (bMatchBBOX && + (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) + { + bReturnFeature = true; + break; + } + + IncrFeatureIdx(); + m_nIdxInBatch++; + if (m_nIdxInBatch == m_poBatch->num_rows()) + { + m_bEOF = !ReadNextBatch(); + if (m_bEOF) + return nullptr; + break; + } } - if (!bSkipToNextFeature) + } while (!bReturnFeature); + } + else if (iCol >= 0) + { + auto array = m_poBatchColumns[iCol].get(); + while (true) + { + bool bMatchBBOX = false; + + auto poGeometry = std::unique_ptr( + ReadGeometry(m_iGeomFieldFilter, array, m_nIdxInBatch)); + if (poGeometry && !poGeometry->IsEmpty()) { - break; + OGREnvelope sEnvelope; + poGeometry->getEnvelope(&sEnvelope); + if (m_sFilterEnvelope.Intersects(sEnvelope)) + { + bMatchBBOX = true; + } } - if (!m_asAttributeFilterConstraints.empty() && - !SkipToNextFeatureDueToAttributeFilter()) + if (bMatchBBOX && (m_asAttributeFilterConstraints.empty() || + !SkipToNextFeatureDueToAttributeFilter())) { break; } diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp index 72207ccbd255..18838645cea3 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp @@ -1362,8 +1362,29 @@ bool OGRParquetLayer::ReadNextBatch() m_oMapGeomFieldIndexToGeomColBBOXParquet.end() && CPLTestBool(CPLGetConfigOption( ("OGR_" + GetDriverUCName() + "_USE_BBOX").c_str(), "YES"))); - - if (m_asAttributeFilterConstraints.empty() && !bUSEBBOXFields) + const bool bIsGeoArrowStruct = + (m_iGeomFieldFilter >= 0 && + m_iGeomFieldFilter < static_cast(m_aeGeomEncoding.size()) && + m_iGeomFieldFilter < + static_cast( + m_anMapGeomFieldIndexToParquetColumns.size()) && + m_anMapGeomFieldIndexToParquetColumns[m_iGeomFieldFilter].size() >= + 2 && + (m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_LINESTRING || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POLYGON || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOINT || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTILINESTRING || + m_aeGeomEncoding[m_iGeomFieldFilter] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_MULTIPOLYGON)); + + if (m_asAttributeFilterConstraints.empty() && !bUSEBBOXFields && + !(bIsGeoArrowStruct && m_poFilterGeom)) { bIterateEverything = true; } @@ -1382,6 +1403,47 @@ bool OGRParquetLayer::ReadNextBatch() int64_t nFeatureIdxSelected = 0; int64_t nFeatureIdxTotal = 0; + int iXMinField = -1; + int iYMinField = -1; + int iXMaxField = -1; + int iYMaxField = -1; + + if (bIsGeoArrowStruct) + { + const auto metadata = + m_poArrowReader->parquet_reader()->metadata(); + const auto poParquetSchema = metadata->schema(); + for (int iParquetCol : + m_anMapGeomFieldIndexToParquetColumns[m_iGeomFieldFilter]) + { + const auto parquetColumn = + poParquetSchema->Column(iParquetCol); + const auto parquetColumnName = + parquetColumn->path()->ToDotString(); + if (parquetColumnName.size() > 2 && + parquetColumnName.find(".x") == + parquetColumnName.size() - 2) + { + iXMinField = iParquetCol; + iXMaxField = iParquetCol; + } + else if (parquetColumnName.size() > 2 && + parquetColumnName.find(".y") == + parquetColumnName.size() - 2) + { + iYMinField = iParquetCol; + iYMaxField = iParquetCol; + } + } + } + else if (bUSEBBOXFields) + { + iXMinField = oIterToGeomColBBOX->second.iParquetXMin; + iYMinField = oIterToGeomColBBOX->second.iParquetYMin; + iXMaxField = oIterToGeomColBBOX->second.iParquetXMax; + iYMaxField = oIterToGeomColBBOX->second.iParquetYMax; + } + for (int iRowGroup = 0; iRowGroup < nNumGroups && !bIterateEverything; ++iRowGroup) { @@ -1389,12 +1451,13 @@ bool OGRParquetLayer::ReadNextBatch() auto poRowGroup = GetReader()->parquet_reader()->RowGroup(iRowGroup); - if (bUSEBBOXFields) + if (iXMinField >= 0 && iYMinField >= 0 && iXMaxField >= 0 && + iYMaxField >= 0) { - if (GetMinMaxForParquetCol( - iRowGroup, oIterToGeomColBBOX->second.iParquetXMin, - nullptr, true, sMin, bFoundMin, false, sMax, - bFoundMax, eType, eSubType, osMinTmp, osMaxTmp) && + if (GetMinMaxForParquetCol(iRowGroup, iXMinField, nullptr, + true, sMin, bFoundMin, false, + sMax, bFoundMax, eType, eSubType, + osMinTmp, osMaxTmp) && bFoundMin && eType == OFTReal) { const double dfGroupMinX = sMin.Real; @@ -1403,11 +1466,9 @@ bool OGRParquetLayer::ReadNextBatch() bSelectGroup = false; } else if (GetMinMaxForParquetCol( - iRowGroup, - oIterToGeomColBBOX->second.iParquetYMin, - nullptr, true, sMin, bFoundMin, false, - sMax, bFoundMax, eType, eSubType, osMinTmp, - osMaxTmp) && + iRowGroup, iYMinField, nullptr, true, sMin, + bFoundMin, false, sMax, bFoundMax, eType, + eSubType, osMinTmp, osMaxTmp) && bFoundMin && eType == OFTReal) { const double dfGroupMinY = sMin.Real; @@ -1416,12 +1477,9 @@ bool OGRParquetLayer::ReadNextBatch() bSelectGroup = false; } else if (GetMinMaxForParquetCol( - iRowGroup, - oIterToGeomColBBOX->second - .iParquetXMax, - nullptr, false, sMin, bFoundMin, true, - sMax, bFoundMax, eType, eSubType, - osMinTmp, osMaxTmp) && + iRowGroup, iXMaxField, nullptr, false, + sMin, bFoundMin, true, sMax, bFoundMax, + eType, eSubType, osMinTmp, osMaxTmp) && bFoundMax && eType == OFTReal) { const double dfGroupMaxX = sMax.Real; @@ -1430,12 +1488,10 @@ bool OGRParquetLayer::ReadNextBatch() bSelectGroup = false; } else if (GetMinMaxForParquetCol( - iRowGroup, - oIterToGeomColBBOX->second - .iParquetYMax, - nullptr, false, sMin, bFoundMin, - true, sMax, bFoundMax, eType, - eSubType, osMinTmp, osMaxTmp) && + iRowGroup, iYMaxField, nullptr, + false, sMin, bFoundMin, true, sMax, + bFoundMax, eType, eSubType, + osMinTmp, osMaxTmp) && bFoundMax && eType == OFTReal) { const double dfGroupMaxY = sMax.Real; @@ -1648,6 +1704,9 @@ bool OGRParquetLayer::ReadNextBatch() { return false; } + CPLDebug("PARQUET", "%d/%d row groups selected", + int(anSelectedGroups.size()), + m_poArrowReader->num_row_groups()); m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; ++m_oFeatureIdxRemappingIter; if (!CreateRecordBatchReader(anSelectedGroups)) @@ -1881,11 +1940,32 @@ OGRErr OGRParquetLayer::SetIgnoredFields(const char **papszFields) oIterParquet != m_oMapGeomFieldIndexToGeomColBBOXParquet.end()) { - oIter->second.iArrayIdx = nBatchColumns++; - m_anRequestedParquetColumns.insert( - m_anRequestedParquetColumns.end(), - oIterParquet->second.anParquetCols.begin(), - oIterParquet->second.anParquetCols.end()); + const bool bIsGeoArrowStruct = + (m_aeGeomEncoding[i] == + OGRArrowGeomEncoding::GEOARROW_STRUCT_POINT || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_LINESTRING || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_POLYGON || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_MULTIPOINT || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_MULTILINESTRING || + m_aeGeomEncoding[i] == + OGRArrowGeomEncoding:: + GEOARROW_STRUCT_MULTIPOLYGON); + if (!bIsGeoArrowStruct) + { + oIter->second.iArrayIdx = nBatchColumns++; + m_anRequestedParquetColumns.insert( + m_anRequestedParquetColumns.end(), + oIterParquet->second.anParquetCols.begin(), + oIterParquet->second.anParquetCols.end()); + } } } else