From eb7306197bff5fe8db65b8d3411837bb9857bab1 Mon Sep 17 00:00:00 2001
From: DmitryRyumin
Date: Sun, 21 Jan 2024 19:39:12 +0000
Subject: [PATCH] Copy Parse Markdown and Generate JSON from Source Repo

---
 code/markdown_to_json_parser.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/code/markdown_to_json_parser.py b/code/markdown_to_json_parser.py
index b6b7941..20671a3 100644
--- a/code/markdown_to_json_parser.py
+++ b/code/markdown_to_json_parser.py
@@ -359,6 +359,17 @@ def extract_paper_data(paper_section, columns):
     title = re.sub(r"<(?:br\s*/?>|img[^>]*>)", "", title)
     title = title.strip()
 
+    html_entities = {
+        "&amp;": "&",
+        "&lt;": "<",
+        "&gt;": ">",
+        "&quot;": '"',
+        "&apos;": "'",
+    }
+    title = re.sub(
+        r"(&\w+;)", lambda x: html_entities.get(x.group(0), x.group(0)), title
+    )
+
     title_link = title_column.find("a")
     title_page = title_link["href"] if title_link else None
 