From 8b18e9d467f94a263b89b38d2078f862b1d5ab04 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 4 Oct 2023 00:27:21 +0200 Subject: [PATCH 1/4] Dataset card template overhaul --- .../templates/datasetcard_template.md | 109 +++++++++++------- .../templates/modelcard_template.md | 4 +- tests/test_repocard.py | 10 +- 3 files changed, 74 insertions(+), 49 deletions(-) diff --git a/src/huggingface_hub/templates/datasetcard_template.md b/src/huggingface_hub/templates/datasetcard_template.md index 6d9281f9d3..aea795c904 100644 --- a/src/huggingface_hub/templates/datasetcard_template.md +++ b/src/huggingface_hub/templates/datasetcard_template.md @@ -1,103 +1,130 @@ --- -# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 +# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1 # Doc / guide: https://huggingface.co/docs/hub/datasets-cards {{ card_data }} --- # Dataset Card for {{ pretty_name | default("Dataset Name", true) }} -## Dataset Description + -- **Homepage:** {{ homepage_url | default("", true)}} -- **Repository:** {{ repo_url | default("", true)}} -- **Paper:** {{ paper_url | default("", true)}} -- **Leaderboard:** {{ leaderboard_url | default("", true)}} -- **Point of Contact:** {{ point_of_contact | default("", true)}} +{{ dataset_summary | default("", true) }} -### Dataset Summary +## Dataset Details -{{ dataset_summary | default("[More Information Needed]", true)}} +### Dataset Description -### Supported Tasks and Leaderboards + -{{ supported_tasks_and_leaderboards_section | default("[More Information Needed]", true)}} +{{ dataset_description | default("", true) }} -### Languages +- **Curated by:** {{ curators | default("[More Information Needed]", true)}} +- **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}} +- **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}} +- **License:** {{ license | default("[More Information Needed]", true)}} -{{ languages_section | default("[More Information Needed]", true)}} +### Dataset Sources [optional] -## Dataset Structure + + +- **Repository:** {{ repo | default("[More Information Needed]", true)}} +- **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}} +- **Demo [optional]:** {{ demo | default("[More Information Needed]", true)}} + +## Uses + + -### Data Instances +### Out-of-Scope Use -{{ data_instances_section | default("[More Information Needed]", true)}} + -### Data Fields +{{ out_of_scope_use | default("[More Information Needed]", true)}} -{{ data_fields_section | default("[More Information Needed]", true)}} +## Dataset Structure -### Data Splits + -{{ data_splits_section | default("[More Information Needed]", true)}} +{{ dataset_structure | default("[More Information Needed]", true)}} ## Dataset Creation ### Curation Rationale + + {{ curation_rationale_section | default("[More Information Needed]", true)}} ### Source Data + + #### Initial Data Collection and Normalization + + {{ data_collection_section | default("[More Information Needed]", true)}} -#### Who are the source language producers? +#### Who are the source data producers? -{{ source_language_producers_section | default("[More Information Needed]", true)}} + -### Annotations +{{ source_data_producers_section | default("[More Information Needed]", true)}} + +### Annotations [optional] + + #### Annotation process + + {{ annotation_process_section | default("[More Information Needed]", true)}} #### Who are the annotators? + + {{ who_are_annotators_section | default("[More Information Needed]", true)}} -### Personal and Sensitive Information +## Bias, Risks, and Limitations + + + +{{ bias_risks_limitations | default("[More Information Needed]", true)}} + +### Recommendations -{{ personal_and_sensitive_information_section | default("[More Information Needed]", true)}} + -## Considerations for Using the Data +{{ bias_recommendations | default("Users should be made aware of the risks, biases and limitations of the dataset. More information needed for further recommendations.", true)}} -### Social Impact of Dataset +## Citation [optional] -{{ social_impact_section | default("[More Information Needed]", true)}} + -### Discussion of Biases +**BibTeX:** -{{ discussion_of_biases_section | default("[More Information Needed]", true)}} +{{ citation_bibtex | default("[More Information Needed]", true)}} -### Other Known Limitations +**APA:** -{{ known_limitations_section | default("[More Information Needed]", true)}} +{{ citation_apa | default("[More Information Needed]", true)}} -## Additional Information +## Glossary [optional] -### Dataset Curators + -{{ dataset_curators_section | default("[More Information Needed]", true)}} +{{ glossary | default("[More Information Needed]", true)}} -### Licensing Information +## More Information [optional] -{{ licensing_information_section | default("[More Information Needed]", true)}} +{{ more_information | default("[More Information Needed]", true)}} -### Citation Information +## Dataset Card Authors [optional] -{{ citation_information_section | default("[More Information Needed]", true)}} +{{ dataset_card_authors | default("[More Information Needed]", true)}} -### Contributions +## Dataset Card Contact -{{ contributions_section | default("[More Information Needed]", true)}} +{{ dataset_card_contact | default("[More Information Needed]", true)}} \ No newline at end of file diff --git a/src/huggingface_hub/templates/modelcard_template.md b/src/huggingface_hub/templates/modelcard_template.md index ec2d18d427..6ad48df1c7 100644 --- a/src/huggingface_hub/templates/modelcard_template.md +++ b/src/huggingface_hub/templates/modelcard_template.md @@ -77,7 +77,7 @@ Use the code below to get started with the model. ### Training Data - + {{ training_data | default("[More Information Needed]", true)}} @@ -108,7 +108,7 @@ Use the code below to get started with the model. #### Testing Data - + {{ testing_data | default("[More Information Needed]", true)}} diff --git a/tests/test_repocard.py b/tests/test_repocard.py index a4255012b2..9c95fc41d5 100644 --- a/tests/test_repocard.py +++ b/tests/test_repocard.py @@ -891,10 +891,8 @@ def test_dataset_card_from_default_template_with_template_variables(self): # Here we pass the card data as kwargs as well so template picks up pretty_name. card = DatasetCard.from_template( card_data, - homepage_url="https://huggingface.co", - repo_url="https://github.com/huggingface/huggingface_hub", - paper_url="https://arxiv.org/pdf/1910.03771.pdf", - point_of_contact="https://huggingface.co/nateraw", + repo="https://github.com/huggingface/huggingface_hub", + paper="https://arxiv.org/pdf/1910.03771.pdf", dataset_summary=( "This is a test dataset card to check if the template variables " "in the dataset card template are working." @@ -903,8 +901,8 @@ def test_dataset_card_from_default_template_with_template_variables(self): self.assertTrue(card.text.strip().startswith("# Dataset Card for My Cool Dataset")) self.assertIsInstance(card, DatasetCard) - matches = re.findall(r"Homepage:\*\* https:\/\/huggingface\.co", str(card)) - self.assertEqual(matches[0], "Homepage:** https://huggingface.co") + matches = re.findall(r"Repository:\*\* https://github\.com/huggingface/huggingface_hub", str(card)) + self.assertEqual(matches[0], "Repository:** https://github.com/huggingface/huggingface_hub") @require_jinja def test_dataset_card_from_custom_template(self): From d191c86803f722b05714962891aeeb132a9cdec7 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 4 Oct 2023 19:04:21 +0200 Subject: [PATCH 2/4] Add Privacy Considerations subsection --- src/huggingface_hub/templates/datasetcard_template.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/huggingface_hub/templates/datasetcard_template.md b/src/huggingface_hub/templates/datasetcard_template.md index aea795c904..73b9f153c6 100644 --- a/src/huggingface_hub/templates/datasetcard_template.md +++ b/src/huggingface_hub/templates/datasetcard_template.md @@ -93,6 +93,12 @@ {{ bias_risks_limitations | default("[More Information Needed]", true)}} +### Privacy Considerations + + + +{{ privacy_considerations | default("[More Information Needed]", true)}} + ### Recommendations From 131b5135a9d41e0e7660d03d43ce64e551342c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Wed, 4 Oct 2023 21:34:52 +0200 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: meg <90473723+meg-huggingface@users.noreply.github.com> --- src/huggingface_hub/templates/datasetcard_template.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/huggingface_hub/templates/datasetcard_template.md b/src/huggingface_hub/templates/datasetcard_template.md index 73b9f153c6..4fb6ae4284 100644 --- a/src/huggingface_hub/templates/datasetcard_template.md +++ b/src/huggingface_hub/templates/datasetcard_template.md @@ -43,7 +43,7 @@ ## Dataset Structure - + {{ dataset_structure | default("[More Information Needed]", true)}} @@ -73,7 +73,7 @@ ### Annotations [optional] - + #### Annotation process @@ -95,7 +95,7 @@ ### Privacy Considerations - + {{ privacy_considerations | default("[More Information Needed]", true)}} From 51a3bb1fe86aa83acb125df5c5e063c71955eda2 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 4 Oct 2023 23:42:59 +0200 Subject: [PATCH 4/4] Address more comments --- .../templates/datasetcard_template.md | 25 ++++++++++++------- .../templates/modelcard_template.md | 1 + 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/huggingface_hub/templates/datasetcard_template.md b/src/huggingface_hub/templates/datasetcard_template.md index 73b9f153c6..97496faaa7 100644 --- a/src/huggingface_hub/templates/datasetcard_template.md +++ b/src/huggingface_hub/templates/datasetcard_template.md @@ -19,6 +19,7 @@ {{ dataset_description | default("", true) }} - **Curated by:** {{ curators | default("[More Information Needed]", true)}} +- **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}} - **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}} - **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}} - **License:** {{ license | default("[More Information Needed]", true)}} @@ -35,6 +36,12 @@ +### Direct Use + + + +{{ direct_use | default("[More Information Needed]", true)}} + ### Out-of-Scope Use @@ -59,11 +66,11 @@ -#### Initial Data Collection and Normalization +#### Data Collection and Processing - + -{{ data_collection_section | default("[More Information Needed]", true)}} +{{ data_collection_and_processing_section | default("[More Information Needed]", true)}} #### Who are the source data producers? @@ -87,17 +94,17 @@ {{ who_are_annotators_section | default("[More Information Needed]", true)}} -## Bias, Risks, and Limitations +#### Personal and Sensitive Information - + -{{ bias_risks_limitations | default("[More Information Needed]", true)}} +{{ personal_and_sensitive_information | default("[More Information Needed]", true)}} -### Privacy Considerations +## Bias, Risks, and Limitations - + -{{ privacy_considerations | default("[More Information Needed]", true)}} +{{ bias_risks_limitations | default("[More Information Needed]", true)}} ### Recommendations diff --git a/src/huggingface_hub/templates/modelcard_template.md b/src/huggingface_hub/templates/modelcard_template.md index 6ad48df1c7..8c9243fbd6 100644 --- a/src/huggingface_hub/templates/modelcard_template.md +++ b/src/huggingface_hub/templates/modelcard_template.md @@ -19,6 +19,7 @@ {{ model_description | default("", true) }} - **Developed by:** {{ developers | default("[More Information Needed]", true)}} +- **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}} - **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}} - **Model type:** {{ model_type | default("[More Information Needed]", true)}} - **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}}