Merge branch 'main' into transformers_future
regisss committed Sep 24, 2024
2 parents bf89e41 + 00dd5bf commit 98b0da5
Showing 76 changed files with 3,080 additions and 800 deletions.
23 changes: 21 additions & 2 deletions examples/image-to-text/README.md
@@ -28,6 +28,8 @@ Models that have been validated:
- [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
- [llava-hf/llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf)
- [llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)
- [llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)
- [llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)

### Inference with BF16

@@ -72,9 +74,26 @@ python3 run_pipeline.py \
--bf16
```

To run llava-hf/llava-v1.6-34b-hf inference, use the following command:

```bash
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-34b-hf \
--use_hpu_graphs \
--bf16
```

To run llava-hf/llama3-llava-next-8b-hf inference, use the following command:

```bash
python3 run_pipeline.py \
--model_name_or_path llava-hf/llama3-llava-next-8b-hf \
--use_hpu_graphs \
--bf16
```

### Inference with FP8
FP8 inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for both measurement and quantization. The Habana Quantization Toolkit (HQT), which was used previously, will be removed in future releases; to fall back to HQT, disable INC by setting the environment variable `USE_INC=0`.

More information on enabling FP8 in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
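
For reference, here is a minimal sketch of the INC measure-then-convert flow that the updated `run_pipeline.py` wraps in its `setup_quantization`/`finalize_quantization` helpers (the config path below is a placeholder; the JSON file itself decides whether a run measures or quantizes):

```python
from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare


def apply_fp8(model, quant_config_path="quant_config.json"):  # placeholder path
    # The JSON config decides whether this run collects calibration statistics
    # or converts the model to FP8 using previously collected statistics.
    config = FP8Config.from_json_file(quant_config_path)
    if config.measure:
        model = prepare(model, config)  # instrument the model for measurement
    elif config.quantize:
        model = convert(model, config)  # swap ops/weights to FP8
    return model


def save_measurements(model):
    # Call this once after the measurement (calibration) run has finished so a
    # later quantization run can consume the statistics.
    finalize_calibration(model)
```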
80 changes: 66 additions & 14 deletions examples/image-to-text/run_pipeline.py
@@ -23,7 +23,7 @@
import PIL.Image
import requests
import torch
from transformers import AutoConfig, pipeline
from transformers import AutoConfig, LlavaNextProcessor, LlavaProcessor, pipeline

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

@@ -36,6 +36,46 @@
logger = logging.getLogger(__name__)


def setup_quantization(model, args):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)
else:
import habana_frameworks.torch.core as htcore
import habana_quantization_toolkit

habana_quantization_toolkit.prep_model(model)
htcore.hpu_initialize(model)

return model


def finalize_quantization(model):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

finalize_calibration(model)
else:
import habana_quantization_toolkit

habana_quantization_toolkit.finish_measurements(model)


def main():
parser = argparse.ArgumentParser()

@@ -101,6 +141,11 @@ def main():
action="store_true",
help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.",
)
parser.add_argument(
"--use_kv_cache",
action="store_true",
help="Whether to use the key/value cache for decoding. It should speed up generation.",
)

args = parser.parse_args()

@@ -116,12 +161,21 @@ def main():
args.image_path = [
"https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
]
if args.prompt is None and model_type == "llava":
args.prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
elif args.prompt is None and model_type == "llava_next":
args.prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
if args.model_name_or_path in ["llava-hf/llava-v1.6-vicuna-13b-hf", "llava-hf/llava-v1.6-vicuna-7b-hf"]:
args.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
if args.prompt is None:
if model_type == "llava":
processor = LlavaProcessor.from_pretrained(args.model_name_or_path)
elif model_type == "llava_next":
processor = LlavaNextProcessor.from_pretrained(args.model_name_or_path)
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
}
]
args.prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

image_paths = args.image_path
image_paths_len = len(image_paths)
@@ -157,6 +211,7 @@ def main():
)
generate_kwargs = {
"lazy_mode": True,
"use_cache": args.use_kv_cache,
"hpu_graphs": args.use_hpu_graphs,
"max_new_tokens": args.max_new_tokens,
"ignore_eos": args.ignore_eos,
@@ -169,18 +224,14 @@ def main():
generator.model = wrap_in_hpu_graph(generator.model)

if args.quant_config:
import habana_quantization_toolkit

habana_quantization_toolkit.prep_model(generator.model)

htcore.hpu_initialize(generator.model)
generator.model = setup_quantization(generator.model, args)

# warm up
for i in range(args.warmup):
generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
torch.hpu.synchronize()
if args.quant_config:
habana_quantization_toolkit.finish_measurements(generator.model)
finalize_quantization(generator.model)

start = time.perf_counter()
for i in range(args.n_iterations):
@@ -197,8 +248,9 @@ def main():

total_new_tokens_generated = args.n_iterations * n_output_tokens
throughput = total_new_tokens_generated / duration
logger.info(f"result = {result}")
logger.info(
f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second"
f"time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second"
)

# Store results if necessary
20 changes: 16 additions & 4 deletions examples/language-modeling/run_lora_clm.py
@@ -701,8 +701,16 @@ def main():
tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize(prompt, add_eos_token=True, add_bos_token=True):
add_eos_token_o = tokenizer.add_eos_token
add_bos_token_o = tokenizer.add_bos_token
if hasattr(tokenizer, "add_eos_token"):
add_eos_token_o = tokenizer.add_eos_token
else:
add_eos_token_o = None

if hasattr(tokenizer, "add_bos_token"):
add_bos_token_o = tokenizer.add_bos_token
else:
add_bos_token_o = None

if not data_args.dataset_concatenation:
tokenizer.add_eos_token = add_eos_token
padding = "max_length"
@@ -717,8 +725,12 @@ def tokenize(prompt, add_eos_token=True, add_bos_token=True):
return_tensors=None,
)
# restore original value
tokenizer.add_eos_token = add_eos_token_o
tokenizer.add_bos_token = add_bos_token_o
if add_eos_token_o is not None:
tokenizer.add_eos_token = add_eos_token_o

if add_bos_token_o is not None:
tokenizer.add_bos_token = add_bos_token_o

for i in range(len(results["input_ids"])):
if (
results["input_ids"][i][-1] != tokenizer.eos_token_id
3 changes: 3 additions & 0 deletions examples/protein-folding/run_esmfold.py
@@ -82,6 +82,9 @@ def convert_outputs_to_pdb(outputs):
test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF" # len = 350

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
# Set _supports_param_buffer_assignment to False since facebook/esmfold_v1's encoder weights are float16.
# Without this fix, the weights would be loaded in float16 on Gaudi2/Gaudi3 and cause a runtime error on Gaudi1.
EsmForProteinFolding._supports_param_buffer_assignment = False
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=False)
model = model.to(device)

28 changes: 26 additions & 2 deletions examples/sentence-transformers-training/nli/README.md
@@ -4,6 +4,8 @@ Given two sentences (premise and hypothesis), the task of Natural Language Infer

The paper by [Conneau et al.](https://arxiv.org/abs/1705.02364) shows that NLI data can be quite useful when training sentence embedding methods. The [Sentence-BERT paper](https://arxiv.org/abs/1908.10084) used NLI as a first fine-tuning step for sentence embedding methods.

# General Models

## Single-card Training

To pre-train on the NLI task:
@@ -46,7 +48,29 @@ For multi-card training you can use the script of [gaudi_spawn.py](https://githu
HABANA_VISIBLE_MODULES="2,3" python ../../gaudi_spawn.py --use_deepspeed --world_size 2 training_nli.py bert-base-uncased
```


# Large Models (intfloat/e5-mistral-7b-instruct)

## Single-card Training with LoRA+gradient_checkpointing

Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB of memory). To address this, we can use LoRA and gradient checkpointing to reduce the memory requirements, making it feasible to train the model on a single HPU (see the sketch after the command below for what these options do).

```bash
python training_nli.py intfloat/e5-mistral-7b-instruct --peft --lora_target_modules "q_proj" "k_proj" "v_proj" --learning_rate 1e-5
```
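
Under the hood, passing `--peft` makes `training_nli.py` wrap the model in a LoRA adapter and turn on gradient checkpointing, roughly as sketched below. The hyperparameters are the ones hard-coded in the script; the `SentenceTransformer` import path is an assumption for this standalone sketch.

```python
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/e5-mistral-7b-instruct")

# LoRA freezes the base weights and trains small low-rank adapters on the
# attention projections, which shrinks the trainable-parameter and
# optimizer-state memory.
peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
    target_modules=["q_proj", "k_proj", "v_proj"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Gradient checkpointing recomputes activations during the backward pass
# instead of storing them all, trading compute for memory.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
```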

## Multi-card Training with DeepSpeed ZeRO-2/3

Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB of memory). To address this, we can use DeepSpeed ZeRO stage 2 or 3, which shards optimizer states, gradients and (with stage 3) model parameters across devices to reduce the per-device memory requirements.

Our tests have shown that training this model requires at least four HPUs when using DeepSpeed ZeRO-2.

```bash
python ../../gaudi_spawn.py --world_size 4 --use_deepspeed training_nli.py intfloat/e5-mistral-7b-instruct --deepspeed ds_config.json --bf16 --no-use_hpu_graphs_for_training --learning_rate 1e-7
```
The command above runs in lazy mode without HPU graphs for training (`--no-use_hpu_graphs_for_training`), uses a lower learning rate of `1e-7`, and configures DeepSpeed through the `ds_config.json` file. To further reduce memory usage, change `"stage"` to `3` (ZeRO-3) in `ds_config.json`.
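
If you prefer to generate that ZeRO-3 variant programmatically instead of editing the JSON by hand, here is a minimal sketch; the output file name `ds_config_zero3.json` is arbitrary and should be passed to `--deepspeed`:

```python
import json

# Same settings as the shipped ds_config.json, with the ZeRO stage bumped to 3
# so that model parameters are also partitioned across devices.
zero3_config = {
    "steps_per_print": 1,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "bf16": {"enabled": True},
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": False,
        "reduce_scatter": False,
        "contiguous_gradients": False,
    },
}

with open("ds_config_zero3.json", "w") as f:
    json.dump(zero3_config, f, indent=2)
```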

# Dataset

We combine [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli) into a dataset we call [AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli). These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:

@@ -58,7 +82,7 @@ We combine [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNL

We format AllNLI in a few different subsets, compatible with different loss functions. See [triplet subset of AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet) as example.
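
For instance, the triplet subset can be inspected directly with the `datasets` library; a minimal sketch (other subset names are listed on the dataset page):

```python
from datasets import load_dataset

# Load the triplet-formatted AllNLI subset; each row pairs an anchor sentence
# with an entailed (positive) and a contradicting (negative) sentence.
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train")
print(train_dataset[0])
```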

# SoftmaxLoss

<img src="https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_SoftmaxLoss.png" alt="SBERT SoftmaxLoss" width="250"/>

16 changes: 16 additions & 0 deletions examples/sentence-transformers-training/nli/ds_config.json
@@ -0,0 +1,16 @@
{
"steps_per_print": 1,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"bf16": {
"enabled": true
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 2,
"overlap_comm": false,
"reduce_scatter": false,
"contiguous_gradients": false
}
}
58 changes: 46 additions & 12 deletions examples/sentence-transformers-training/nli/training_nli.py
@@ -4,8 +4,8 @@
STS benchmark dataset
"""

import argparse
import logging
import sys
from datetime import datetime

from datasets import load_dataset
@@ -28,16 +28,43 @@ def main():
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

# You can specify any Hugging Face pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-uncased"
train_batch_size = 16
parser = argparse.ArgumentParser()
parser.add_argument("model_name", help="model name or path", default="bert-base-uncased", nargs="?")
parser.add_argument("--peft", help="use LoRA", action="store_true", default=False)
parser.add_argument("--lora_target_modules", nargs="+", default=["query", "key", "value"])
parser.add_argument("--bf16", help="use bf16", action="store_true", default=False)
parser.add_argument(
"--use_hpu_graphs_for_training",
help="use hpu graphs for training",
action=argparse.BooleanOptionalAction,
default=True,
)
parser.add_argument("--learning_rate", help="learning rate", type=float, default=5e-5)
parser.add_argument("--deepspeed", help="deepspeed config file", default=None)
parser.add_argument("--train_batch_size", help="train batch size", default=16, type=int)
args = parser.parse_args()

output_dir = (
"output/training_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
"output/training_nli_" + args.model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
model = SentenceTransformer(args.model_name)
if args.peft:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
r=16,
lora_alpha=64,
lora_dropout=0.05,
bias="none",
inference_mode=False,
target_modules=args.lora_target_modules,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
# We'll start with 10k training samples, but you can increase this to get a stronger model
@@ -66,16 +93,16 @@ def main():
dev_evaluator(model)

# 5. Define the training arguments
args = SentenceTransformerGaudiTrainingArguments(
stargs = SentenceTransformerGaudiTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
per_device_train_batch_size=args.train_batch_size,
per_device_eval_batch_size=args.train_batch_size,
warmup_ratio=0.1,
# fp16=True, # Set to False if you get an error that your GPU can't run on FP16
# bf16=False, # Set to True if you have a GPU that supports BF16
bf16=args.bf16, # Set to True if you have a GPU that supports BF16
# Optional tracking/debugging parameters:
evaluation_strategy="steps",
eval_steps=100,
@@ -87,16 +114,18 @@ def main():
use_habana=True,
gaudi_config_name="Habana/bert-base-uncased",
use_lazy_mode=True,
use_hpu_graphs=True,
use_hpu_graphs=args.use_hpu_graphs_for_training,
use_hpu_graphs_for_inference=False,
use_hpu_graphs_for_training=True,
use_hpu_graphs_for_training=args.use_hpu_graphs_for_training,
dataloader_drop_last=True,
learning_rate=args.learning_rate,
deepspeed=args.deepspeed,
)

# 6. Create the trainer & start training
trainer = SentenceTransformerGaudiTrainer(
model=model,
args=args,
args=stargs,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
@@ -119,6 +148,11 @@ def main():
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)

if args.peft:
model.eval()
model = model.merge_and_unload()
model.save_pretrained(f"{output_dir}/merged")


if __name__ == "__main__":
main()
