From 0290b3c3893e6bb543f35819c1ed76abf66aaaba Mon Sep 17 00:00:00 2001 From: ErnestaP Date: Wed, 2 Aug 2023 16:30:23 +0200 Subject: [PATCH] mapping: springer parsing --- springer_fields_mapping.md | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 springer_fields_mapping.md diff --git a/springer_fields_mapping.md b/springer_fields_mapping.md new file mode 100644 index 00000000..b61b8746 --- /dev/null +++ b/springer_fields_mapping.md @@ -0,0 +1,83 @@ + + + +# Parsing + +| Reference | Field | Source | | Subfields | Parsing | | | | | | +|-----------|------------------|------------------------------------------------------------------------------------------------------|-----------------|----------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---|---|---|---|---| +| | | tags path | atribute | | | | | | | | +| [1] | dois | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleDOI | | | | | | | | | +| [2] | journal_doctype | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleDOI | journal_doctype | | lambda x: article_type_mapping[x] \n {"OriginalPaper": "article","ReviewPaper": "review","BriefCommunication": "article","EditorialNotes": "editorial","BookReview": "review","ContinuingEducation": "other","Interview": "other","Letter": "other","Erratum": "erratum","Legacy": "other","Abstract":"other","Report": "other","Announcement": "other","News": "other","Events": "other","Acknowledgments": "other","MediaReport": 
"other","BibliographicalNote": "other","ProductNotes": "other","Unknown": "other"} | | | | | | +| [3] | arxiv_eprints | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleExternalID[@Type='arXiv' | | value [22] | | | | | | | +| [4] | page_nr | | | | page_nr ; final value [25] | | | | | | +| [5] | abstract | | | | _get_abstract | | | | | | +| [6] | title | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleTitle | | | | | | | | | +| [7] | authors | | | orcid [27] surname [28] given_names [29] email [30] affiliations [31] | _get_authors | | | | | | +| [8] | collaborations | ./Journal/Volume/Issue/Article/ArticleHeader/AuthorGroup/InstitutionalAuthor/InstitutionalAuthorName | | | ```lambda x: [x]``` | | | | | | +| [9] | journal_title | ./Journal/JournalInfo/JournalTitle | | | ```lambda s: s.lstrip("The ")``` | | | | | | +| [10] | journal_issue | ./Journal/Volume/Issue/IssueInfo/IssueIDStart | | | | | | | | | +| [11] | journal_volume | ./Journal/Volume/VolumeInfo/VolumeIDStart | | | | | | | | | +| [12] | journal_artid | ./Journal/Volume/Issue/Article | ID | | | | | | | | +| [13] | journal_fpage | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleFirstPage | | | | | | | | | +| [14] | journal_lpage | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleLastPage | | | | | | | | | +| [15] | journal_year | ./Journal/Volume/Issue/Article/ArticleInfo/*/OnlineDate/Year | | | | | | | | | +| [16] | date_published | | | | _get_published_date ; final value [34] | | | | | | +| [17] | copyright_holder | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleCopyright/CopyrightHolderName | | | | | | | | | +| [18] | copyright_year | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleCopyright/CopyrightYear | | | ```lambda x: int(x)``` | | | | | | +| [19] | copyright_statement | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleCopyright/copyright-statement | | | | +| [20] | license | ./Journal/JournalInfo/JournalTitle | | url [38]
license [39] | _get_license | +| [21] | collections | | | | ```lambda x: [x.lstrip("The ")]``` | + + + +### [_get_arxiv_eprints](#_get_arxiv_eprints) + +| Reference | Method | Tags path | Processing | +|-----------|---------|----------------------------------------------------------------------------|---------------------------------------------------------------------| +| [22](#22) | findall | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleExternalID[@Type='arXiv'] | Put all found arxiv values in an array of dicts: {"value": arxiv.text} | + + +### [page_nr](#page_nr) + +| Reference | Value | Method | Tags path | Processing | Default value | +|-----------|-----------------|--------|-------------------------------------------------------------|------------------------------------|---------------| +| [23] | first_page_node | find | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleFirstPage | | | +| [24] | last_page_node | find | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleLastPage | | | +| [25](#25) | FINAL VALUE | | | [last_page_node - first_page_node] | [] | + + +### [_get_abstract](#_get_abstract) +| Reference | Method | Path | Processing | +|-----------|--------|------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [26] | find | ./Journal/Volume/Issue/Article/ArticleHeader/Abstract/Para | Filters values from the paragraph: keeps only those which don't have the tag "EquationSource" or attribute["Format"] == "TEX". Then joins all filtered values into a single string and removes repeated (>1) blank spaces. | + +### [_get_authors](#_get_authors) + +| Reference | Field | Tag path | Attribute | +|-----------|--------------|-----------------------------------------------------------|-----------| +| [27](#27) | orcid | . 
| ORCID | +| [28](#28) | surname | ./AuthorName/FamilyName | | +| [29](#29) | given_names | ./AuthorName/GivenName | | +| [30](#30) | email | ./Contact/Email | | +| [31](#31) | affiliations | ./Journal/Volume/Issue/Article/ArticleHeader/AuthorGroup | | + + +### [_get_published_date](#_get_published_date) +| Reference | Value | Method | Tag path | Processing | +|-----------|-------------|--------|---------------------------------------------------------------|----------------------------------------------------------------------------------| +| [31] | year | find | ./Journal/Volume/Issue/Article/ArticleInfo/*/OnlineDate/Year | | +| [32] | month | find | ./Journal/Volume/Issue/Article/ArticleInfo/*/OnlineDate/Month | | +| [33] | day | find | ./Journal/Volume/Issue/Article/ArticleInfo/*/OnlineDate/Day | | +| [34](#34) | FINAL VALUE | | | ``` datetime.date(day=int(day), month=int(month), year=int(year)).isoformat()``` | + +### [_get_license](#_get_license) + + +| Reference | Field | Value | Method | Tag path | Attribute | Default value | Processing | +|-----------|---------|--------------|--------|---------------------------------------------------------------------|-----------|-----------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [35] | | license_type | find | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleCopyright/License | SubType | | ```license_type_parts = license_type.split(" ")``` ```license_type = "-".join(license_type_parts)``` | +| [36] | | version | find | ./Journal/Volume/Issue/Article/ArticleInfo/ArticleCopyright/License | Version | | | +| [37] | | base_url | | | | https://creativecommons.org/licenses | | +| [38](#38) | url | | | | | https://creativecommons.org/licenses/by/3.0 | 
```(f"{base_url}/{license_type_parts[1].lower()}/{version}") or ( url and license_type and version: {"url": url, "license": f"{license_type}-{version}"}) or (license_type and version: {"license": f"{license_type}-{version}"} )``` | +| [39](#39) | license | license_type | | | | CC-BY-3.0 | | +| | | | | | | | |