vera/ln_tuning add and test case add #1294

Merged, Sep 25, 2024 (5 commits)
Changes from 2 commits
4 changes: 2 additions & 2 deletions examples/language-modeling/README.md
@@ -362,7 +362,7 @@ python run_clm.py \

## PEFT

### LORA/ADALORA/IA3/LLAMA_ADAPTER
### LORA/ADALORA/IA3/LLAMA_ADAPTER/VERA/LN_TUNING

To run LoRA finetuning, you can use `run_lora_clm.py`.
Here are single-/multi-device command examples for Llama1-7B, Falcon-40B, Llama2-70B, Llama3-8B and Llama3-70B.
@@ -691,7 +691,7 @@ DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ..
--validation_split_percentage 5 \
--deepspeed ds_falcon_180b_z3.json
```
Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`.
Default `peft_type` is `lora`; you can enable adalora or ia3 with `--peft_type adalora` or `--peft_type ia3`, llama-adapter for Llama models with `--peft_type llama-adapter`, ln-tuning with `--peft_type ln_tuning`, or vera with `--peft_type vera`.

#### Custom Files

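For context on what the two new options do, here is a minimal, self-contained sketch (not part of this PR; it assumes `peft>=0.11`, which exports both `VeraConfig` and `LNTuningConfig`, and uses `facebook/opt-125m` with illustrative target-module names rather than anything taken from the diff):

```python
# Hedged sketch: illustrates the two PEFT methods the README now mentions.
from peft import LNTuningConfig, TaskType, VeraConfig, get_peft_model
from transformers import AutoModelForCausalLM

# VeRA: a shared pair of frozen random projections plus small per-layer
# trainable scaling vectors on the targeted linear layers.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
vera = get_peft_model(
    model,
    VeraConfig(target_modules=["q_proj", "v_proj"], task_type=TaskType.CAUSAL_LM),
)
vera.print_trainable_parameters()

# LN tuning: only the parameters of the targeted LayerNorm modules are trained.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
ln = get_peft_model(
    model,
    LNTuningConfig(
        target_modules=["self_attn_layer_norm", "final_layer_norm"],
        task_type=TaskType.CAUSAL_LM,
    ),
)
ln.print_trainable_parameters()
```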
31 changes: 29 additions & 2 deletions examples/language-modeling/run_lora_clm.py
@@ -30,7 +30,17 @@
import torch
import transformers
from datasets import load_dataset
from peft import AdaLoraConfig, AdaptionPromptConfig, IA3Config, LoraConfig, TaskType, get_peft_model, tuners
from peft import (
AdaLoraConfig,
AdaptionPromptConfig,
IA3Config,
LNTuningConfig,
LoraConfig,
TaskType,
VeraConfig,
get_peft_model,
tuners,
)
from peft.utils.other import fsdp_auto_wrap_policy
from transformers import (
AutoConfig,
@@ -345,7 +355,7 @@ class FinetuneArguments:
default="lora",
metadata={
"help": ("The PEFT type to use."),
"choices": ["lora", "ia3", "adalora", "llama-adapter"],
"choices": ["lora", "ia3", "adalora", "llama-adapter", "vera", "ln_tuning"],
},
)
ia3_target_modules: List[str] = field(
@@ -364,6 +374,14 @@
default=10,
metadata={"help": "Number of adapter tokens to insert in llama-adapter"},
)
vera_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the vera method."},
)
ln_target_modules: List[str] = field(
default_factory=lambda: None,
metadata={"help": "Target modules for the ln method."},
)


PROMPT_DICT = {
@@ -839,6 +857,15 @@ def compute_metrics(eval_preds):

tuners.adaption_prompt.layer.AdaptedAttention.pre_attn_forward = GaudiAdaptedAttentionPreAttnForward
tuners.adaption_prompt.layer.AdaptedAttention.__getattr__ = GaudiAdaptedAttention_getattr
elif finetune_args.peft_type == "vera":
peft_config = VeraConfig(
target_modules=finetune_args.vera_target_modules, task_type=TaskType.CAUSAL_LM, init_weights=False
)
elif finetune_args.peft_type == "ln_tuning":
peft_config = LNTuningConfig(
target_modules=finetune_args.ln_target_modules,
task_type=TaskType.CAUSAL_LM,
)
if training_args.gradient_checkpointing:
model.enable_input_require_grads()
lora_model = get_peft_model(model, peft_config)
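The new `vera_target_modules` and `ln_target_modules` fields reach the branches above through `HfArgumentParser`. A minimal, hypothetical sketch of that parsing step (trimmed from the `FinetuneArguments` dataclass in the diff; the explicit argument list is only there to make it runnable standalone):

```python
# Hedged sketch of how HfArgumentParser maps the new CLI flags onto the
# dataclass fields added above; field defaults mirror the diff.
from dataclasses import dataclass, field
from typing import List

from transformers import HfArgumentParser


@dataclass
class FinetuneArguments:
    peft_type: str = field(default="lora")
    vera_target_modules: List[str] = field(default_factory=lambda: None)
    ln_target_modules: List[str] = field(default_factory=lambda: None)


(args,) = HfArgumentParser(FinetuneArguments).parse_args_into_dataclasses(
    ["--peft_type", "vera", "--vera_target_modules", "q_proj", "v_proj"]
)
print(args.peft_type, args.vera_target_modules)  # -> vera ['q_proj', 'v_proj']
```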
142 changes: 141 additions & 1 deletion tests/baselines/llama_7b.json
@@ -349,7 +349,7 @@
"--max_train_samples 1000",
"--use_habana",
"--ppo_epochs 1",
"--batched_gen True",
"--batched_gen True",
"--mini_batch_size 1",
"--output_max_length 128",
"--input_max_length 128",
@@ -473,6 +473,146 @@
]
}
}
},
"ia3": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 8,
"perplexity": 3.3,
"train_runtime": 262.8,
"train_samples_per_second": 161,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 2",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--ia3_target_modules q_proj v_proj",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type ia3"
]
}
}
},
"adalora": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 8,
"perplexity": 2.59,
"train_runtime": 459,
"train_samples_per_second": 107,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 2",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--lora_alpha 16",
"--lora_dropout 0.05",
"--lora_target_modules q_proj v_proj",
"--adalora_init_r 12",
"--adalora_target_r 4",
"--adalora_tinit 50",
"--adalora_tfinal 500",
"--adalora_delta_t 100",
"--adalora_orth_reg_weight 0.5",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type adalora"
]
}
}
},
"vera": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 1e-2,
"train_batch_size": 8,
"perplexity": 8.68,
"train_runtime": 318.7,
"train_samples_per_second": 126.6,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 1",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--vera_target_modules q_proj v_proj",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type vera"
]
}
}
},
"ln_tuning": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 8,
"perplexity": 2.83,
"train_runtime": 249,
"train_samples_per_second": 165,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 2",
"--eval_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--max_grad_norm 0.3",
"--logging_steps 1",
"--use_hpu_graphs_for_inference",
"--ln_target_module input_layernorm post_attention_layernorm norm",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--attn_softmax_bf16",
"--peft_type ln_tuning"
]
}
}
}
}
}
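The baseline entries above drive the new tests. A hypothetical sanity check (not part of the PR or the test harness; it assumes it is run from the repository root) that walks the file and reports which `--peft_type` each entry pins:

```python
# Hedged sketch: recursively find every "extra_arguments" list in the baseline
# file and print the path to it together with the --peft_type value it sets.
import json


def iter_extra_arguments(node, path=()):
    if isinstance(node, dict):
        if "extra_arguments" in node:
            yield path, node["extra_arguments"]
        for key, value in node.items():
            yield from iter_extra_arguments(value, path + (key,))


with open("tests/baselines/llama_7b.json") as f:
    baselines = json.load(f)

for path, extra in iter_extra_arguments(baselines):
    peft_types = [arg.split()[1] for arg in extra if arg.startswith("--peft_type")]
    print("/".join(path), peft_types or ["lora (default)"])
```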
37 changes: 36 additions & 1 deletion tests/test_examples.py
@@ -246,7 +246,14 @@ def to_test(
return False
elif ("qwen2" in model_name or "Qwen2" in model_name) and task_name == "trl-sft":
return False
elif "falcon" in model_name and task_name in ("llama-adapter", "databricks/databricks-dolly-15k"):
elif "falcon" in model_name and task_name in (
"llama-adapter",
"databricks/databricks-dolly-15k",
"vera",
"ia3",
"adalora",
"ln_tuning",
):
return False
elif model_name not in models_with_specific_rules and not deepspeed:
return True
@@ -899,3 +906,31 @@ class MultiCardCausalLanguageModelingLoRAFP8ExampleTester(
):
TASK_NAME = "tatsu-lab/alpaca_fp8"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingVeraExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "vera"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingLnExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "ln_tuning"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingIA3ExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "ia3"
DATASET_NAME = "tatsu-lab/alpaca"


class MultiCardCausalLanguageModelingAdaloraExampleTester(
ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
TASK_NAME = "adalora"
DATASET_NAME = "tatsu-lab/alpaca"
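The four new tester classes rely on `ExampleTestMeta` to generate their actual test methods from `TASK_NAME`, `DATASET_NAME`, and the keyword arguments in the class statement. A deliberately simplified, hypothetical sketch of that pattern (the real metaclass in `tests/test_examples.py` launches the example script and compares metrics against the JSON baselines):

```python
# Hedged sketch of a metaclass-driven tester, not the real implementation.
from unittest import TestCase


class ExampleTestMeta(type):
    def __new__(mcs, name, bases, attrs, example_name=None, multi_card=False):
        # Record the class-statement keyword arguments and synthesize a test.
        attrs["EXAMPLE_NAME"] = example_name
        attrs["MULTI_CARD"] = multi_card

        def test_task_is_configured(self):
            # The real generated test runs `example_name` with the arguments
            # from the baseline JSON and asserts on the reported metrics.
            self.assertIsInstance(self.TASK_NAME, str)
            self.assertIsInstance(self.DATASET_NAME, str)

        attrs["test_task_is_configured"] = test_task_is_configured
        return super().__new__(mcs, name, bases, attrs)


class ExampleTesterBase(TestCase):
    TASK_NAME = None
    DATASET_NAME = None


class MultiCardVeraTester(
    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True
):
    TASK_NAME = "vera"
    DATASET_NAME = "tatsu-lab/alpaca"
```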