From 12ee4a792c41fbe79ec4a439aafda21a7d679946 Mon Sep 17 00:00:00 2001 From: "Maarten L. Hekkelman" Date: Mon, 29 Jan 2024 13:00:50 +0100 Subject: [PATCH] pdb2cif work --- .gitignore | 3 +- src/pdb/pdb2cif.cpp | 117 ++++++++++++++--------------------- src/pdb/reconstruct.cpp | 19 ++++-- test/reconstruction-test.cpp | 22 +++++-- 4 files changed, 80 insertions(+), 81 deletions(-) diff --git a/.gitignore b/.gitignore index df8946d..05e3702 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ Testing/ include/cif++/exports.hpp docs/api docs/conf.py -build_ci/ \ No newline at end of file +build_ci/ +data/components.cif diff --git a/src/pdb/pdb2cif.cpp b/src/pdb/pdb2cif.cpp index 942a9a4..93dc86e 100644 --- a/src/pdb/pdb2cif.cpp +++ b/src/pdb/pdb2cif.cpp @@ -1123,9 +1123,6 @@ void PDBFileParser::PreParseInput(std::istream &is) if (lookahead.back() == '\r') lookahead.pop_back(); - // if (cif::starts_with(lookahead, "HEADER") == false) - // throw std::runtime_error("This does not look like a PDB file, should start with a HEADER line"); - auto contNr = [&lookahead](int offset, int len) -> int { std::string cs = lookahead.substr(offset, len); @@ -1558,52 +1555,54 @@ void PDBFileParser::ParseTitle() // 11 - 80 Specification compound Description of the molecular components. // list - std::string value{ mRec->vS(11) }; - if (value.find(':') == std::string::npos) - { - // special case for dumb, stripped files - auto &comp = GetOrCreateCompound(1); - comp.mInfo["MOLECULE"] = value; - } - else + if (mRec->is("COMPND")) { - SpecificationListParser p(value); - - for (;;) + std::string value{ mRec->vS(11) }; + if (value.find(':') == std::string::npos) { - std::string key, val; - std::tie(key, val) = p.GetNextSpecification(); - - if (key.empty()) - break; + // special case for dumb, stripped files + auto &comp = GetOrCreateCompound(1); + comp.mInfo["MOLECULE"] = value; + } + else + { + SpecificationListParser p(value); - if (not iequals(key, "MOL_ID") and mCompounds.empty()) + for (;;) { - if (cif::VERBOSE > 0) - std::cerr << "Ignoring invalid COMPND record\n"; - break; - } + std::string key, val; + std::tie(key, val) = p.GetNextSpecification(); - if (key == "MOL_ID") - { - auto &comp = GetOrCreateCompound(stoi(val)); - comp.mTitle = title; - } - else if (key == "CHAIN") - { - for (auto c : cif::split(val, ",")) + if (key.empty()) + break; + + if (not iequals(key, "MOL_ID") and mCompounds.empty()) { - cif::trim(c); - mCompounds.back().mChains.insert(c[0]); + if (cif::VERBOSE > 0) + std::cerr << "Ignoring invalid COMPND record\n"; + break; } + + if (key == "MOL_ID") + { + auto &comp = GetOrCreateCompound(stoi(val)); + comp.mTitle = title; + } + else if (key == "CHAIN") + { + for (auto c : cif::split(val, ",")) + { + cif::trim(c); + mCompounds.back().mChains.insert(c[0]); + } + } + else + mCompounds.back().mInfo[key] = val; } - else - mCompounds.back().mInfo[key] = val; } - } - if (mRec->is("COMPND")) GetNextRecord(); + } // SOURCE Match("SOURCE", false); @@ -1740,7 +1739,7 @@ void PDBFileParser::ParseTitle() int n = 1; cat = getCategory("audit_author"); - value = { mRec->vS(11) }; + std::string value = { mRec->vS(11) }; for (auto author : cif::split(value, ",", true)) { // clang-format off @@ -4556,7 +4555,7 @@ void PDBFileParser::ConstructEntities() std::string formula; std::string type; std::string nstd = "."; - std::string formulaWeight; + std::optional formulaWeight; if (compound != nullptr) { @@ -4567,7 +4566,7 @@ void PDBFileParser::ConstructEntities() nstd = "y"; formula = compound->formula(); - formulaWeight = std::to_string(compound->formula_weight()); + formulaWeight = compound->formula_weight(); } if (name.empty()) @@ -4594,7 +4593,7 @@ void PDBFileParser::ConstructEntities() { "id", cc }, { "name", name }, { "formula", formula }, - { "formula_weight", formulaWeight }, + { "formula_weight", formulaWeight, 3 }, { "mon_nstd_flag", nstd }, { "type", type } }); @@ -4709,7 +4708,7 @@ void PDBFileParser::ConstructEntities() } if (formula_weight > 0) - entity["formula_weight"] = formula_weight; + entity.assign({ { "formula_weight", formula_weight, 3 } }); } } @@ -5578,31 +5577,6 @@ void PDBFileParser::ParseCrystallographic() GetNextRecord(); } - else - { - // clang-format off - - // no cryst1, make a simple one, like this: - // CRYST1 1.000 1.000 1.000 90.00 90.00 90.00 P 1 1 - getCategory("cell")->emplace({ - { "entry_id", mStructureID }, // 1 - 6 Record name "CRYST1" - { "length_a", 1 }, // 7 - 15 Real(9.3) a a (Angstroms). - { "length_b", 1 }, // 16 - 24 Real(9.3) b b (Angstroms). - { "length_c", 1 }, // 25 - 33 Real(9.3) c c (Angstroms). - { "angle_alpha", 90 }, // 34 - 40 Real(7.2) alpha alpha (degrees). - { "angle_beta", 90 }, // 41 - 47 Real(7.2) beta beta (degrees). - { "angle_gamma", 90 }, // 48 - 54 Real(7.2) gamma gamma (degrees). - /* goes into symmetry */ // 56 - 66 LString sGroup Space group. - { "Z_PDB", 1 } // 67 - 70 Integer z Z value. - }); - - getCategory("symmetry")->emplace({ - { "entry_id", mStructureID }, - { "space_group_name_H-M", "P 1" }, - { "Int_Tables_number", 1 } - }); - // clang-format on - } } void PDBFileParser::ParseCoordinateTransformation() @@ -6463,7 +6437,12 @@ file read(std::istream &is) // and so the very first character in a valid PDB file // is 'H'. It is as simple as that. - if (ch == 'h' or ch == 'H') + // Well, not quite, Unfortunately... People insisted that + // having only ATOM records also makes up a valid PDB file... + // Since mmCIF files cannot validly start with a letter character + // the test has changed into the following: + + if (std::isalpha(ch)) read_pdb_file(is, result); else { diff --git a/src/pdb/reconstruct.cpp b/src/pdb/reconstruct.cpp index 01b3143..9ff1640 100644 --- a/src/pdb/reconstruct.cpp +++ b/src/pdb/reconstruct.cpp @@ -491,12 +491,13 @@ void checkAtomAnisotropRecords(datablock &db) auto &atom_site = db["atom_site"]; auto &atom_site_anisotrop = db["atom_site_anisotrop"]; - auto m_validator = db.get_validator(); - if (not m_validator) - return; + // auto m_validator = db.get_validator(); + // if (not m_validator) + // return; std::vector to_be_deleted; + bool warnReplaceTypeSymbol = true; for (auto row : atom_site_anisotrop) { auto parents = atom_site_anisotrop.get_parents(row, atom_site); @@ -512,6 +513,12 @@ void checkAtomAnisotropRecords(datablock &db) if (row["type_symbol"].empty()) row["type_symbol"] = parent["type_symbol"].text(); + else if (row["type_symbol"].text() != parent["type_symbol"].text()) + { + if (cif::VERBOSE and std::exchange(warnReplaceTypeSymbol, false)) + std::clog << "Replacing type_symbol in atom_site_anisotrop record(s)\n"; + row["type_symbol"] != parent["type_symbol"].text(); + } if (row["pdbx_auth_alt_id"].empty()) row["pdbx_auth_alt_id"] = parent["pdbx_auth_alt_id"].text(); @@ -1019,9 +1026,6 @@ bool reconstruct_pdbx(file &file, std::string_view dictionary) // Now see if atom records make sense at all checkAtomRecords(db); - if (db.get("atom_site_anisotrop")) - checkAtomAnisotropRecords(db); - std::vector invalidCategories; // clean up each category @@ -1244,6 +1248,9 @@ bool reconstruct_pdbx(file &file, std::string_view dictionary) file.load_dictionary(dictionary); + if (db.get("atom_site_anisotrop")) + checkAtomAnisotropRecords(db); + // Now create any missing categories // Next make sure we have struct_asym records if (db.get("struct_asym") == nullptr) diff --git a/test/reconstruction-test.cpp b/test/reconstruction-test.cpp index 6c2bf76..c38623f 100644 --- a/test/reconstruction-test.cpp +++ b/test/reconstruction-test.cpp @@ -41,12 +41,24 @@ TEST_CASE("reconstruct") { std::cout << i->path() << '\n'; - cif::file f(i->path()); + if (i->path().extension() == ".pdb") + { + cif::file f = cif::pdb::read(i->path()); - std::error_code ec; - CHECK_FALSE(cif::pdb::is_valid_pdbx_file(f, ec)); - CHECK(ec != std::errc{}); + std::error_code ec; - CHECK(cif::pdb::reconstruct_pdbx(f)); + if (not cif::pdb::is_valid_pdbx_file(f, ec)) + CHECK(cif::pdb::reconstruct_pdbx(f)); + } + else + { + cif::file f(i->path()); + + std::error_code ec; + CHECK_FALSE(cif::pdb::is_valid_pdbx_file(f, ec)); + CHECK(ec != std::errc{}); + + CHECK(cif::pdb::reconstruct_pdbx(f)); + } } } \ No newline at end of file