Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add trust_remote_code for loading datasets in the audio classification example #1074

Merged
merged 6 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions examples/audio-classification/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ python ../gaudi_spawn.py \
--use_hpu_graphs_for_inference \
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
- --bf16
+ --bf16 \
+ --trust_remote_code True
```

On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.49%**.
Expand Down Expand Up @@ -141,7 +142,8 @@ python ../gaudi_spawn.py \
--use_hpu_graphs_for_inference \
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
- --deepspeed ../../tests/configs/deepspeed_zero_2.json
+ --deepspeed ../../tests/configs/deepspeed_zero_2.json \
+ --trust_remote_code True
```

[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.
Expand Down
8 changes: 5 additions & 3 deletions examples/audio-classification/run_audio_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,9 +167,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -254,12 +254,14 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
raw_datasets["eval"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
Expand Down
13 changes: 7 additions & 6 deletions examples/contrastive-image-text/run_bridgetower.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -203,9 +203,9 @@ def __post_init__(self):
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
- if self.validation_file is not None:
- extension = self.validation_file.split(".")[-1]
- assert extension == "json", "`validation_file` should be a json file."
+ if self.test_file is not None:
+ extension = self.test_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."


dataset_name_mapping = {
Expand Down Expand Up @@ -328,6 +328,7 @@ def main():
data_dir=data_args.data_dir,
token=model_args.token,
revision=data_args.dataset_revision,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
13 changes: 7 additions & 6 deletions examples/contrastive-image-text/run_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -201,9 +201,9 @@ def __post_init__(self):
if self.validation_file is not None:
extension = self.validation_file.split(".")[-1]
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
- if self.validation_file is not None:
- extension = self.validation_file.split(".")[-1]
- assert extension == "json", "`validation_file` should be a json file."
+ if self.test_file is not None:
+ extension = self.test_file.split(".")[-1]
+ assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."


dataset_name_mapping = {
Expand Down Expand Up @@ -325,6 +325,7 @@ def main():
keep_in_memory=False,
data_dir=data_args.data_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
7 changes: 4 additions & 3 deletions examples/image-classification/run_image_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -259,6 +259,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 6 additions & 3 deletions examples/language-modeling/run_clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -341,6 +341,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
Expand All @@ -350,6 +351,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -358,6 +360,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 8 additions & 1 deletion examples/language-modeling/run_lora_clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,11 @@ class ModelArguments:
trust_remote_code: bool = field(
default=False,
metadata={
- "help": "should enable when using custom model architecture that is not yet part of the Hugging Face transformers package like MPT)."
+ "help": (
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
+ )
},
)
use_cache: bool = field(
Expand Down Expand Up @@ -502,6 +506,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)

if "validation" not in raw_datasets.keys() and training_args.do_eval:
Expand All @@ -511,13 +516,15 @@ def main():
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
9 changes: 6 additions & 3 deletions examples/language-modeling/run_mlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -340,6 +340,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
Expand All @@ -349,6 +350,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
raw_datasets["train"] = load_dataset(
data_args.dataset_name,
Expand All @@ -357,6 +359,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
7 changes: 4 additions & 3 deletions examples/language-modeling/run_prompt_tuning_clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -248,6 +248,7 @@ def main():
cache_dir=model_args.cache_dir,
token=model_args.token,
streaming=data_args.streaming,
+ trust_remote_code=model_args.trust_remote_code,
)
if data_args.dataset_name == "ought/raft" and data_args.dataset_config_name == "twitter_complaints":
text_column = "Tweet text"
Expand Down
7 changes: 4 additions & 3 deletions examples/question-answering/run_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -319,6 +319,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
7 changes: 4 additions & 3 deletions examples/question-answering/run_seq2seq_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -364,6 +364,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
8 changes: 5 additions & 3 deletions examples/speech-recognition/run_speech_recognition_ctc.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,9 +261,9 @@ class DataTrainingArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -467,6 +467,7 @@ def main():
data_args.dataset_config_name,
split=data_args.train_split_name,
token=data_args.token,
+ trust_remote_code=data_args.trust_remote_code,
)

if data_args.audio_column_name not in raw_datasets["train"].column_names:
Expand All @@ -492,6 +493,7 @@ def main():
data_args.dataset_config_name,
split=data_args.eval_split_name,
token=data_args.token,
+ trust_remote_code=data_args.trust_remote_code,
)

if data_args.max_eval_samples is not None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -372,6 +372,7 @@ def main():
split=data_args.train_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)

if training_args.do_eval:
Expand All @@ -381,6 +382,7 @@ def main():
split=data_args.eval_split_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)

if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
Expand Down
7 changes: 4 additions & 3 deletions examples/summarization/run_summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ class ModelArguments:
default=False,
metadata={
"help": (
- "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
- "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
- "execute code present on the Hub on your local machine."
+ "Whether to trust the execution of code from datasets/models defined on the Hub."
+ " This option should only be set to `True` for repositories you trust and in which you have read the"
+ " code, as it will execute code present on the Hub on your local machine."
)
},
)
Expand Down Expand Up @@ -428,6 +428,7 @@ def main():
data_args.dataset_config_name,
cache_dir=model_args.cache_dir,
token=model_args.token,
+ trust_remote_code=model_args.trust_remote_code,
)
else:
data_files = {}
Expand Down
Loading
Loading