From 8b18e9d467f94a263b89b38d2078f862b1d5ab04 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 4 Oct 2023 00:27:21 +0200 Subject: [PATCH] Dataset card template overhaul --- .../templates/datasetcard_template.md | 109 +++++++++++------- .../templates/modelcard_template.md | 4 +- tests/test_repocard.py | 10 +- 3 files changed, 74 insertions(+), 49 deletions(-) diff --git a/src/huggingface_hub/templates/datasetcard_template.md b/src/huggingface_hub/templates/datasetcard_template.md index 6d9281f9d3..aea795c904 100644 --- a/src/huggingface_hub/templates/datasetcard_template.md +++ b/src/huggingface_hub/templates/datasetcard_template.md @@ -1,103 +1,130 @@ --- -# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 +# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 # Doc / guide: https://huggingface.co/docs/hub/datasets-cards {{ card_data }} --- # Dataset Card for {{ pretty_name | default("Dataset Name", true) }} -## Dataset Description + -- **Homepage:** {{ homepage_url | default("", true)}} -- **Repository:** {{ repo_url | default("", true)}} -- **Paper:** {{ paper_url | default("", true)}} -- **Leaderboard:** {{ leaderboard_url | default("", true)}} -- **Point of Contact:** {{ point_of_contact | default("", true)}} +{{ dataset_summary | default("", true) }} -### Dataset Summary +## Dataset Details -{{ dataset_summary | default("[More Information Needed]", true)}} +### Dataset Description -### Supported Tasks and Leaderboards + -{{ supported_tasks_and_leaderboards_section | default("[More Information Needed]", true)}} +{{ dataset_description | default("", true) }} -### Languages +- **Curated by:** {{ curators | default("[More Information Needed]", true)}} +- **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}} +- **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}} +- **License:** {{ license | default("[More Information Needed]", true)}} -{{ languages_section | default("[More Information Needed]", true)}} +### Dataset Sources [optional] -## Dataset Structure + + +- **Repository:** {{ repo | default("[More Information Needed]", true)}} +- **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}} +- **Demo [optional]:** {{ demo | default("[More Information Needed]", true)}} + +## Uses + + -### Data Instances +### Out-of-Scope Use -{{ data_instances_section | default("[More Information Needed]", true)}} + -### Data Fields +{{ out_of_scope_use | default("[More Information Needed]", true)}} -{{ data_fields_section | default("[More Information Needed]", true)}} +## Dataset Structure -### Data Splits + -{{ data_splits_section | default("[More Information Needed]", true)}} +{{ dataset_structure | default("[More Information Needed]", true)}} ## Dataset Creation ### Curation Rationale + + {{ curation_rationale_section | default("[More Information Needed]", true)}} ### Source Data + + #### Initial Data Collection and Normalization + + {{ data_collection_section | default("[More Information Needed]", true)}} -#### Who are the source language producers? +#### Who are the source data producers? -{{ source_language_producers_section | default("[More Information Needed]", true)}} + -### Annotations +{{ source_data_producers_section | default("[More Information Needed]", true)}} + +### Annotations [optional] + + #### Annotation process + + {{ annotation_process_section | default("[More Information Needed]", true)}} #### Who are the annotators? + + {{ who_are_annotators_section | default("[More Information Needed]", true)}} -### Personal and Sensitive Information +## Bias, Risks, and Limitations + + + +{{ bias_risks_limitations | default("[More Information Needed]", true)}} + +### Recommendations -{{ personal_and_sensitive_information_section | default("[More Information Needed]", true)}} + -## Considerations for Using the Data +{{ bias_recommendations | default("Users should be made aware of the risks, biases and limitations of the dataset. More information needed for further recommendations.", true)}} -### Social Impact of Dataset +## Citation [optional] -{{ social_impact_section | default("[More Information Needed]", true)}} + -### Discussion of Biases +**BibTeX:** -{{ discussion_of_biases_section | default("[More Information Needed]", true)}} +{{ citation_bibtex | default("[More Information Needed]", true)}} -### Other Known Limitations +**APA:** -{{ known_limitations_section | default("[More Information Needed]", true)}} +{{ citation_apa | default("[More Information Needed]", true)}} -## Additional Information +## Glossary [optional] -### Dataset Curators + -{{ dataset_curators_section | default("[More Information Needed]", true)}} +{{ glossary | default("[More Information Needed]", true)}} -### Licensing Information +## More Information [optional] -{{ licensing_information_section | default("[More Information Needed]", true)}} +{{ more_information | default("[More Information Needed]", true)}} -### Citation Information +## Dataset Card Authors [optional] -{{ citation_information_section | default("[More Information Needed]", true)}} +{{ dataset_card_authors | default("[More Information Needed]", true)}} -### Contributions +## Dataset Card Contact -{{ contributions_section | default("[More Information Needed]", true)}} +{{ dataset_card_contact | default("[More Information Needed]", true)}} \ No newline at end of file diff --git a/src/huggingface_hub/templates/modelcard_template.md b/src/huggingface_hub/templates/modelcard_template.md index ec2d18d427..6ad48df1c7 100644 --- a/src/huggingface_hub/templates/modelcard_template.md +++ b/src/huggingface_hub/templates/modelcard_template.md @@ -77,7 +77,7 @@ Use the code below to get started with the model. ### Training Data - + {{ training_data | default("[More Information Needed]", true)}} @@ -108,7 +108,7 @@ Use the code below to get started with the model. #### Testing Data - + {{ testing_data | default("[More Information Needed]", true)}} diff --git a/tests/test_repocard.py b/tests/test_repocard.py index a4255012b2..9c95fc41d5 100644 --- a/tests/test_repocard.py +++ b/tests/test_repocard.py @@ -891,10 +891,8 @@ def test_dataset_card_from_default_template_with_template_variables(self): # Here we pass the card data as kwargs as well so template picks up pretty_name. card = DatasetCard.from_template( card_data, - homepage_url="https://huggingface.co", - repo_url="https://github.com/huggingface/huggingface_hub", - paper_url="https://arxiv.org/pdf/1910.03771.pdf", - point_of_contact="https://huggingface.co/nateraw", + repo="https://github.com/huggingface/huggingface_hub", + paper="https://arxiv.org/pdf/1910.03771.pdf", dataset_summary=( "This is a test dataset card to check if the template variables " "in the dataset card template are working." @@ -903,8 +901,8 @@ def test_dataset_card_from_default_template_with_template_variables(self): self.assertTrue(card.text.strip().startswith("# Dataset Card for My Cool Dataset")) self.assertIsInstance(card, DatasetCard) - matches = re.findall(r"Homepage:\*\* https:\/\/huggingface\.co", str(card)) - self.assertEqual(matches[0], "Homepage:** https://huggingface.co") + matches = re.findall(r"Repository:\*\* https://github\.com/huggingface/huggingface_hub", str(card)) + self.assertEqual(matches[0], "Repository:** https://github.com/huggingface/huggingface_hub") @require_jinja def test_dataset_card_from_custom_template(self):