diff --git a/serving/docker/partition/partition.py b/serving/docker/partition/partition.py
index 5421a5f2d..0afa3e36c 100644
--- a/serving/docker/partition/partition.py
+++ b/serving/docker/partition/partition.py
@@ -24,8 +24,6 @@ import utils
 from properties_manager import PropertiesManager
 from huggingface_hub import snapshot_download
 
-from awq import AutoAWQForCausalLM
-from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
 from datasets import load_dataset
 
 from utils import (get_partition_cmd, extract_python_jar,
@@ -266,14 +264,21 @@ def autoawq_quantize(self):
             "version": "GEMM"
         }
         logging.info(f"Model loading kwargs: {hf_configs.kwargs}")
-        awq_model = AutoAWQForCausalLM.from_pretrained(
-            hf_configs.model_id_or_path, **hf_configs.kwargs)
-        awq_model.quantize(tokenizer, quant_config=quant_config)
-
-        output_path = self.properties['option.save_mp_checkpoint_path']
-        logging.info(f"Saving model and tokenizer to: {output_path}")
-        awq_model.save_quantized(output_path)
-        tokenizer.save_pretrained(output_path)
+        try:
+            from awq import AutoAWQForCausalLM
+            awq_model = AutoAWQForCausalLM.from_pretrained(
+                hf_configs.model_id_or_path, **hf_configs.kwargs)
+            awq_model.quantize(tokenizer, quant_config=quant_config)
+
+            output_path = self.properties['option.save_mp_checkpoint_path']
+            logging.info(f"Saving model and tokenizer to: {output_path}")
+            awq_model.save_quantized(output_path)
+            tokenizer.save_pretrained(output_path)
+        except ImportError:
+            logging.error(
+                "AutoAWQ is not installed. Failing during quantization.")
+            raise ImportError(
+                "AutoAWQ is not installed. Failing during quantization.")
 
     def autofp8_quantize(self, config: Optional[dict] = None):
         """
@@ -304,17 +309,25 @@ def autofp8_quantize(self, config: Optional[dict] = None):
                              truncation=True,
                              return_tensors="pt").to("cuda")
 
-        quantize_config = BaseQuantizeConfig(**config)
-        logging.info(
-            f"Using the following configurations for fp8 quantization: {vars(quantize_config)}"
-        )
-        model = AutoFP8ForCausalLM.from_pretrained(hf_configs.model_id_or_path,
-                                                   quantize_config,
-                                                   **hf_configs.kwargs)
-        model.quantize(examples)
-        output_path = self.properties['option.save_mp_checkpoint_path']
-        logging.info(f"Quantization complete. Saving model to: {output_path}")
-        model.save_quantized(output_path)
+        try:
+            from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
+            quantize_config = BaseQuantizeConfig(**config)
+            logging.info(
+                f"Using the following configurations for fp8 quantization: {vars(quantize_config)}"
+            )
+            model = AutoFP8ForCausalLM.from_pretrained(
+                hf_configs.model_id_or_path, quantize_config,
+                **hf_configs.kwargs)
+            model.quantize(examples)
+            output_path = self.properties['option.save_mp_checkpoint_path']
+            logging.info(
+                f"Quantization complete. Saving model to: {output_path}")
+            model.save_quantized(output_path)
+        except ImportError:
+            logging.error(
+                "AutoFP8 is not installed. Failing during quantization.")
+            raise ImportError(
+                "AutoFP8 is not installed. Failing during quantization.")
 
 
 def main():
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 642551e5b..3f88b548e 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -171,19 +171,10 @@ def get_model_name():
         "batch_size": [1],
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-2-13B-fp16"
-    }
-}
-
-transformers_neuronx_aot_model_spec = {
-    "gpt2": {
-        "worker": 1,
-        "seq_length": [512],
-        "batch_size": [4]
     },
-    "gpt2-quantize": {
-        "worker": 1,
-        "seq_length": [512],
-        "batch_size": [4]
+    "tiny-llama-rb": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
     },
 }
 
@@ -1778,9 +1769,6 @@ def run(raw_args):
                                          transformers_neuronx_model_spec)
     elif args.handler == "transformers_neuronx_rolling_batch":
         test_handler_rolling_batch(args.model, transformers_neuronx_model_spec)
-    elif args.handler == "transformers_neuronx-aot":
-        test_transformers_neuronx_handler(args.model,
-                                          transformers_neuronx_aot_model_spec)
     elif args.handler == "transformers_neuronx_neo":
         test_transformers_neuronx_handler(args.model,
                                           transformers_neuronx_neo_model_spec)
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index cda399844..1eec73382 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -76,50 +76,9 @@
     }
 }
 
-transformers_neuronx_aot_handler_list = {
-    "gpt2": {
-        "option.model_id":
-        "gpt2",
-        "option.batch_size":
-        4,
-        "option.tensor_parallel_degree":
-        2,
-        "option.n_positions":
-        512,
-        "option.dtype":
-        "fp16",
-        "option.model_loading_timeout":
-        600,
-        "option.enable_streaming":
-        False,
-        "option.save_mp_checkpoint_path":
-        "/opt/ml/input/data/training/partition-test"
-    },
-    "gpt2-quantize": {
-        "option.model_id":
-        "gpt2",
-        "option.batch_size":
-        4,
-        "option.tensor_parallel_degree":
-        2,
-        "option.n_positions":
-        512,
-        "option.dtype":
-        "fp16",
-        "option.model_loading_timeout":
-        600,
-        "option.quantize":
-        "static_int8",
-        "option.enable_streaming":
-        False,
-        "option.save_mp_checkpoint_path":
-        "/opt/ml/input/data/training/partition-test"
-    },
-}
-
 transformers_neuronx_handler_list = {
     "gpt2": {
-        "option.model_id": "gpt2",
+        "option.model_id": "s3://djl-llm/gpt2/",
         "max_dynamic_batch_size": 4,
         "option.tensor_parallel_degree": 2,
         "option.n_positions": 512,
@@ -127,7 +86,7 @@
         "option.model_loading_timeout": 600
     },
     "gpt2-quantize": {
-        "option.model_id": "gpt2",
+        "option.model_id": "s3://djl-llm/gpt2/",
        "batch_size": 4,
         "option.tensor_parallel_degree": 2,
         "option.n_positions": 512,
@@ -276,6 +235,23 @@
         "option.max_rolling_batch_size": 1,
         "option.model_loading_timeout": 3600,
         "option.output_formatter": "jsonlines"
+    },
+    "tiny-llama-rb-aot": {
+        "option.model_id": "s3://djl-llm/tinyllama-1.1b-chat/",
+        "option.tensor_parallel_degree": 2,
+        "option.n_positions": 1024,
+        "option.max_rolling_batch_size": 4,
+        "option.rolling_batch": 'auto',
+        "option.model_loading_timeout": 1200,
+    },
+    "tiny-llama-rb-aot-quant": {
+        "option.model_id": "s3://djl-llm/tinyllama-1.1b-chat/",
+        "option.quantize": "static_int8",
+        "option.tensor_parallel_degree": 2,
+        "option.n_positions": 1024,
+        "option.max_rolling_batch_size": 4,
+        "option.rolling_batch": 'auto',
+        "option.model_loading_timeout": 1200,
     }
 }
 
@@ -1217,17 +1193,6 @@ def build_transformers_neuronx_handler_model(model):
     write_model_artifacts(options)
 
 
-def build_transformers_neuronx_aot_handler_model(model):
-    if model not in transformers_neuronx_aot_handler_list.keys():
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(transformers_neuronx_aot_handler_list.keys())}"
-        )
-    options = transformers_neuronx_aot_handler_list[model]
-    options["engine"] = "Python"
-    options["option.entryPoint"] = "djl_python.transformers_neuronx"
-    write_model_artifacts(options)
-
-
 def build_rolling_batch_model(model):
     if model not in rolling_batch_model_list.keys():
         raise ValueError(
@@ -1364,7 +1329,6 @@ def build_text_embedding_model(model):
 supported_handler = {
     'huggingface': build_hf_handler_model,
     'transformers_neuronx': build_transformers_neuronx_handler_model,
-    'transformers_neuronx_aot': build_transformers_neuronx_aot_handler_model,
     'performance': build_performance_model,
     'handler_performance': build_handler_performance_model,
     'rolling_batch_scheduler': build_rolling_batch_model,
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 3124980af..6f716c950 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -2,6 +2,7 @@
 import os
 import subprocess
 
+import logging
 import pytest
 import llm.prepare as prepare
 import llm.client as client
@@ -62,7 +63,9 @@ def __exit__(self, *args):
                 f"cp client_logs/{esc_test_name}_client.log all_logs/{esc_test_name}/ || true"
             )
             os.system(f"cp -r logs all_logs/{esc_test_name}")
-            subprocess.run(["./remove_container.sh"], check=True)
+            subprocess.run(["./remove_container.sh"],
+                           check=True,
+                           capture_output=True)
             os.system("cat logs/serving.log")
 
     def launch(self, env_vars=None, container=None, cmd=None):
@@ -715,34 +718,23 @@ def test_bloom(self):
             r.launch(container='pytorch-inf2-2')
             client.run("transformers_neuronx bloom-7b1".split())
 
-    @pytest.mark.parametrize("model", ["gpt2", "gpt2-quantize"])
+    @pytest.mark.parametrize("model",
+                             ["tiny-llama-rb-aot", "tiny-llama-rb-aot-quant"])
     def test_partition(self, model):
-        try:
-            with Runner('pytorch-inf2', f'partition-{model}') as r:
+        with Runner('pytorch-inf2', f'partition-{model}') as r:
+            try:
                 prepare.build_transformers_neuronx_handler_model(model)
-                with open("models/test/requirements.txt", "a") as f:
-                    f.write("dummy_test")
-                partition_output = r.launch(
+                r.launch(
                     container="pytorch-inf2-1",
                     cmd=
-                    'partition --model-dir /opt/ml/input/data/training/ --save-mp-checkpoint-path /opt/ml/input/data/training/partition --skip-copy'
+                    "partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot --skip-copy"
                 )
-
-                # Check if neff files are generated
-                if len([
-                        fn
-                        for fn in os.listdir("models/test/partition/compiled")
-                        if fn.endswith(".neff")
-                ]) == 0:
-                    raise Exception("Failed to generate any .neff files")
-
-                # Check whether requirements.txt download is sufficient
-                if 'pip install requirements succeed!' not in partition_output.stdout.decode(
-                        "utf-8"):
-                    raise Exception(
-                        "Requirements.txt not installed successfully")
-        finally:
-            os.system('sudo rm -rf models')
+                r.launch(container="pytorch-inf2-1",
+                         cmd="serve -m test=file:/opt/ml/model/test/aot")
+                client.run(
+                    "transformers_neuronx_rolling_batch tiny-llama-rb".split())
+            finally:
+                os.system('sudo rm -rf models')
 
     @pytest.mark.inf