From 1cc7cd15bd20c2ad358bc09c754f51f075ff0f8f Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Tue, 16 Apr 2024 12:31:44 +0200 Subject: [PATCH] OSM: properly escape special characters with TAGS_FORMAT=JSON open option Fixes https://github.com/OSGeo/gdal/issues/9673 --- autotest/ogr/data/osm/test_json.osm | 7 ++ autotest/ogr/data/osm/test_json.pbf | Bin 0 -> 204 bytes autotest/ogr/ogr_osm.py | 18 +++++ ogr/ogrsf_frmts/osm/ogr_osm.h | 2 +- ogr/ogrsf_frmts/osm/ogrosmlayer.cpp | 119 +++++++++++++++------------- 5 files changed, 90 insertions(+), 56 deletions(-) create mode 100644 autotest/ogr/data/osm/test_json.osm create mode 100644 autotest/ogr/data/osm/test_json.pbf diff --git a/autotest/ogr/data/osm/test_json.osm b/autotest/ogr/data/osm/test_json.osm new file mode 100644 index 000000000000..f754e1295aa3 --- /dev/null +++ b/autotest/ogr/data/osm/test_json.osm @@ -0,0 +1,7 @@ + + + + + + + diff --git a/autotest/ogr/data/osm/test_json.pbf b/autotest/ogr/data/osm/test_json.pbf new file mode 100644 index 0000000000000000000000000000000000000000..4f94bbfe12aa62331bcfe658d215fda213d2a8f6 GIT binary patch literal 204 zcmV;-05ksp000dN2~Sf^NM&JUWpWr)5J(zOc%0*s;%KMAUU8~leoAU_6C-baac*X5uD+q3ArJuo zDTNx10000B3I|V9O+;aIVHj8tOd3>poa2(=V&LK|&d*JaFD*_j;$lwA&*$Q(P>)gK zB?v G(Gg#s8CI$Q literal 0 HcmV?d00001 diff --git a/autotest/ogr/ogr_osm.py b/autotest/ogr/ogr_osm.py index 41c84e298606..f50d66b04a68 100755 --- a/autotest/ogr/ogr_osm.py +++ b/autotest/ogr/ogr_osm.py @@ -911,3 +911,21 @@ def test_ogr_osm_tags_json(): assert lyr_defn.GetFieldDefn(other_tags_idx).GetSubType() == ogr.OFSTJSON f = lyr.GetNextFeature() assert f["other_tags"] == '{"foo":"bar"}' + + +############################################################################### +# Test TAGS_FORMAT=JSON + + +def test_ogr_osm_tags_json_special_characters(): + + ds = gdal.OpenEx("data/osm/test_json.pbf", open_options=["TAGS_FORMAT=JSON"]) + + lyr = ds.GetLayerByName("points") + lyr_defn = lyr.GetLayerDefn() + other_tags_idx = lyr_defn.GetFieldIndex("other_tags") + assert other_tags_idx >= 0 + assert lyr_defn.GetFieldDefn(other_tags_idx).GetType() == ogr.OFTString + assert lyr_defn.GetFieldDefn(other_tags_idx).GetSubType() == ogr.OFSTJSON + f = lyr.GetNextFeature() + assert f["other_tags"] == """{"foo":"x'\\\\\\"\\t\\n\\ry"}""" diff --git a/ogr/ogrsf_frmts/osm/ogr_osm.h b/ogr/ogrsf_frmts/osm/ogr_osm.h index 029271381631..c4ed2c414483 100644 --- a/ogr/ogrsf_frmts/osm/ogr_osm.h +++ b/ogr/ogrsf_frmts/osm/ogr_osm.h @@ -124,7 +124,7 @@ class OGROSMLayer final : public OGRLayer bool m_bHasWarnedTooManyFeatures = false; - char *m_pszAllTags = nullptr; + std::string m_osAllTagsBuffer{}; bool m_bHasWarnedAllTagsTruncated = false; bool m_bUserInterested = true; diff --git a/ogr/ogrsf_frmts/osm/ogrosmlayer.cpp b/ogr/ogrsf_frmts/osm/ogrosmlayer.cpp index f54bdddca306..e43e09e30a48 100644 --- a/ogr/ogrsf_frmts/osm/ogrosmlayer.cpp +++ b/ogr/ogrsf_frmts/osm/ogrosmlayer.cpp @@ -59,8 +59,6 @@ constexpr int SWITCH_THRESHOLD = 10000; constexpr int MAX_THRESHOLD = 100000; -constexpr int ALLTAGS_LENGTH = 8192; - /************************************************************************/ /* OGROSMLayer() */ /************************************************************************/ @@ -69,8 +67,7 @@ OGROSMLayer::OGROSMLayer(OGROSMDataSource *poDSIn, int nIdxLayerIn, const char *pszName) : m_poDS(poDSIn), m_nIdxLayer(nIdxLayerIn), m_poFeatureDefn(new OGRFeatureDefn(pszName)), - m_poSRS(new OGRSpatialReference()), - m_pszAllTags(static_cast(CPLMalloc(ALLTAGS_LENGTH))) + m_poSRS(new OGRSpatialReference()) { SetDescription(m_poFeatureDefn->GetName()); m_poFeatureDefn->Reference(); @@ -113,8 +110,6 @@ OGROSMLayer::~OGROSMLayer() sqlite3_finalize(m_oComputedAttributes[i].hStmt); } - CPLFree(m_pszAllTags); - CPLFree(m_papoFeatures); } @@ -520,25 +515,61 @@ int OGROSMLayer::AddInOtherOrAllTags(const char *pszK) } /************************************************************************/ -/* OGROSMEscapeString() */ +/* OGROSMEscapeStringHSTORE() */ /************************************************************************/ -static int OGROSMEscapeString(const char *pszV, char *pszAllTags) +static void OGROSMEscapeStringHSTORE(const char *pszV, std::string &sOut) { - int nAllTagsOff = 0; - - pszAllTags[nAllTagsOff++] = '"'; + sOut += '"'; for (int k = 0; pszV[k] != '\0'; k++) { if (pszV[k] == '"' || pszV[k] == '\\') - pszAllTags[nAllTagsOff++] = '\\'; - pszAllTags[nAllTagsOff++] = pszV[k]; + sOut += '\\'; + sOut += pszV[k]; } - pszAllTags[nAllTagsOff++] = '"'; + sOut += '"'; +} + +/************************************************************************/ +/* OGROSMEscapeStringJSON() */ +/************************************************************************/ + +static void OGROSMEscapeStringJSON(const char *pszV, std::string &sOut) +{ + sOut += '"'; - return nAllTagsOff; + for (int k = 0; pszV[k] != '\0'; k++) + { + const char ch = pszV[k]; + switch (ch) + { + case '"': + sOut += "\\\""; + break; + case '\\': + sOut += "\\\\"; + break; + case '\n': + sOut += "\\n"; + break; + case '\r': + sOut += "\\r"; + break; + case '\t': + sOut += "\\t"; + break; + default: + if (static_cast(ch) < ' ') + sOut += CPLSPrintf("\\u%04X", ch); + else + sOut += ch; + break; + } + } + + sOut += '"'; } /************************************************************************/ @@ -627,7 +658,7 @@ void OGROSMLayer::SetFieldsFromTags(OGRFeature *poFeature, GIntBig nID, poFeature->SetField("osm_changeset", (int)psInfo->nChangeset); } - int nAllTagsOff = 0; + m_osAllTagsBuffer.clear(); for (unsigned int j = 0; j < nTags; j++) { const char *pszK = pasTags[j].pszK; @@ -643,48 +674,27 @@ void OGROSMLayer::SetFieldsFromTags(OGRFeature *poFeature, GIntBig nID, { if (AddInOtherOrAllTags(pszK)) { - const int nLenK = static_cast(strlen(pszK)); - const int nLenV = static_cast(strlen(pszV)); - const int nLenKEscaped = 1 + 2 * nLenK + 1; - const int nLenVEscaped = 1 + 2 * nLenV + 1; - // 3 is either for - // - HSTORE: ',' separator and '=>' - // - JSON: leading '{' or ',', ':' and closing '}' - if (nAllTagsOff + nLenKEscaped + nLenVEscaped + 3 >= - ALLTAGS_LENGTH - 1) - { - if (!m_bHasWarnedAllTagsTruncated) - CPLDebug("OSM", - "all_tags/other_tags field truncated for " - "feature " CPL_FRMT_GIB, - nID); - m_bHasWarnedAllTagsTruncated = true; - continue; - } - if (m_poDS->m_bTagsAsHSTORE) { - if (nAllTagsOff) - m_pszAllTags[nAllTagsOff++] = ','; + if (!m_osAllTagsBuffer.empty()) + m_osAllTagsBuffer += ','; - nAllTagsOff += - OGROSMEscapeString(pszK, m_pszAllTags + nAllTagsOff); + OGROSMEscapeStringHSTORE(pszK, m_osAllTagsBuffer); - m_pszAllTags[nAllTagsOff++] = '='; - m_pszAllTags[nAllTagsOff++] = '>'; + m_osAllTagsBuffer += '='; + m_osAllTagsBuffer += '>'; - nAllTagsOff += - OGROSMEscapeString(pszV, m_pszAllTags + nAllTagsOff); + OGROSMEscapeStringHSTORE(pszV, m_osAllTagsBuffer); } else { - m_pszAllTags[nAllTagsOff] = nAllTagsOff ? ',' : '{'; - nAllTagsOff++; - nAllTagsOff += - OGROSMEscapeString(pszK, m_pszAllTags + nAllTagsOff); - m_pszAllTags[nAllTagsOff++] = ':'; - nAllTagsOff += - OGROSMEscapeString(pszV, m_pszAllTags + nAllTagsOff); + if (!m_osAllTagsBuffer.empty()) + m_osAllTagsBuffer += ','; + else + m_osAllTagsBuffer = '{'; + OGROSMEscapeStringJSON(pszK, m_osAllTagsBuffer); + m_osAllTagsBuffer += ':'; + OGROSMEscapeStringJSON(pszV, m_osAllTagsBuffer); } } @@ -698,17 +708,16 @@ void OGROSMLayer::SetFieldsFromTags(OGRFeature *poFeature, GIntBig nID, } } - if (nAllTagsOff) + if (!m_osAllTagsBuffer.empty()) { if (!m_poDS->m_bTagsAsHSTORE) { - m_pszAllTags[nAllTagsOff++] = '}'; + m_osAllTagsBuffer += '}'; } - m_pszAllTags[nAllTagsOff] = '\0'; if (m_nIndexAllTags >= 0) - poFeature->SetField(m_nIndexAllTags, m_pszAllTags); + poFeature->SetField(m_nIndexAllTags, m_osAllTagsBuffer.c_str()); else - poFeature->SetField(m_nIndexOtherTags, m_pszAllTags); + poFeature->SetField(m_nIndexOtherTags, m_osAllTagsBuffer.c_str()); } for (size_t i = 0; i < m_oComputedAttributes.size(); i++)