Merge branch 'dev' into trtllm-mistral
Showing 12 changed files with 440 additions and 497 deletions; only one file's diff is rendered below.
The diff moves the ExLlamaV2 benchmark from a self-contained argparse/logging script onto the shared `BaseBenchmarkClass` harness, splitting generation into `preprocess`/`run_model`/`postprocess` hooks and delegating CLI parsing and reporting to `common.utils`:

```diff
@@ -1,115 +1,147 @@
-import argparse
-import logging
 import os
 import sys
-import time
-from collections import defaultdict
 
-import numpy as np
 import torch
 from exllamav2 import ExLlamaV2, ExLlamaV2Cache
 from exllamav2.config import ExLlamaV2Config
 from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
 from exllamav2.tokenizer.tokenizer import ExLlamaV2Tokenizer
+from transformers import AutoTokenizer
 
-logging.getLogger("llama_cpp").setLevel(logging.ERROR)
-logging.basicConfig(
-    stream=sys.stdout,
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-)
+sys.path.append(os.getcwd())
+
+from common.base import BaseBenchmarkClass  # noqa
+from common.utils import launch_cli, make_report  # noqa
 
 
-class ExllamaV2Benchmark:
-    def __init__(self, model_path: str) -> None:
-        self.model_path, self.results = model_path, []
+class ExLlamaV2Benchmark(BaseBenchmarkClass):
+    def __init__(
+        self,
+        model_path: str,
+        model_name: str,
+        benchmark_name: str,
+        precision: str,
+        device: str,
+        experiment_name: str,
+    ) -> None:
+        assert precision in ["int8", "int4"], ValueError(
+            "Available precision: 'int8', 'int4'"
+        )
+        super().__init__(
+            model_name=model_name,
+            model_path=model_path,
+            benchmark_name=benchmark_name,
+            experiment_name=experiment_name,
+            precision=precision,
+            device=device,
+        )
 
-    def load_model(self):
+    def load_model_and_tokenizer(self):
+        # set up model config
         self.config = ExLlamaV2Config()
         self.config.model_dir = self.model_path
         self.config.prepare()
 
-        self.model = ExLlamaV2(self.config)
-        self.cache = ExLlamaV2Cache(self.model, lazy=True)
-        self.model.load_autosplit(self.cache)
-        self.tokenizer = ExLlamaV2Tokenizer(self.config)
-
-        self.generator = ExLlamaV2BaseGenerator(self.model, self.cache, self.tokenizer)
-
+        # set up model and cache
+        self._model = ExLlamaV2(self.config)
+        self.cache = ExLlamaV2Cache(self._model, lazy=True)
+        self._model.load_autosplit(self.cache)
+        self.tokenizer_exllama = ExLlamaV2Tokenizer(self.config)
+        self.model = ExLlamaV2BaseGenerator(
+            self._model, self.cache, self.tokenizer_exllama
+        )
+        self.model.warmup()
+
+        # set up the huggingface tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+
+        # set up exllamav2 settings
         self.settings = ExLlamaV2Sampler.Settings()
-        self.settings.temperature = 0.85
-        self.settings.top_k = 50
-        self.settings.top_p = 0.8
-        self.settings.token_repetition_penalty = 1.05
-        self.settings.disallow_tokens(self.tokenizer, [self.tokenizer.eos_token_id])
-        self.generator.warmup()
+        self.settings.disallow_tokens(
+            self.tokenizer_exllama, [self.tokenizer_exllama.eos_token_id]
+        )
         return self
 
-    @torch.inference_mode()
-    def run_model(self, prompt: str, max_tokens: int) -> float:
-        start = time.time()
-        _ = self.generator.generate_simple(prompt, self.settings, max_tokens, seed=1234)
-        delta = time.time() - start
-        return len(self.generator.sequence_ids[0]) / delta
-
-    def benchmark(self, prompt: str, max_tokens: int, repetitions: int) -> None:
-        for i in range(repetitions):
-            logging.info(
-                f"Running repetition [{str(i+1).zfill(len(str(repetitions)))}/{repetitions}]"
-            )
-            tokens_per_second = self.run_model(prompt, max_tokens)
-            self.results.append(tokens_per_second)
+    def preprocess(
+        self, prompt: str, chat_mode: bool = True, for_benchmarks: bool = True
+    ):
+        if chat_mode:
+            template = self.get_chat_template_with_instruction(
+                prompt=prompt, for_benchmarks=for_benchmarks
+            )
+            prompt = self.tokenizer.apply_chat_template(template, tokenize=False)
+        tokenized_input = self.tokenizer.encode(text=prompt)
+        return {
+            "prompt": prompt,
+            "input_tokens": tokenized_input,
+            "tensor": None,
+            "num_input_tokens": len(tokenized_input),
+        }
+
+    def run_model(self, inputs: dict, max_tokens: int, temperature: float) -> dict:
+        # first set up the settings
+        self.settings.token_repetition_penalty = 1.01
+        self.settings.temperature = temperature
+        self.settings.top_k = 50
+        self.settings.top_p = 0.1
+
+        # now run the model
+        prompt = inputs["prompt"]
+        output_text = self.model.generate_simple(
+            prompt,
+            self.settings,
+            max_tokens,
+            seed=1234,
+            completion_only=True,
+            decode_special_tokens=True,
+        )
+
+        tokenized_output = self.tokenizer.encode(output_text)
+        return {
+            "output_text": output_text,
+            "output_tokens": tokenized_output,
+            "num_output_tokens": len(tokenized_output),
+        }
+
+    def postprocess(self, output: dict) -> str:
+        return output["output_text"]
+
+    def on_exit(self):
+        if self.device == "cuda":
+            del self.model
+            torch.cuda.synchronize()
+        else:
+            del self.model
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="llama.cpp Benchmark Llama model.")
-    parser.add_argument(
-        "--prompt",
-        type=str,
-        help="The prompt for the model.",
-    )
-    parser.add_argument("--max_tokens", type=int, help="The maximum number of tokens.")
-    parser.add_argument(
-        "--repetitions",
-        type=int,
-        help="The number of repetitions for the benchmark.",
-    )
-    parser.add_argument(
-        "--log_file",
-        type=str,
-        help="Path to the log file for writing logs (in append mode).",
-    )
-    parser.add_argument(
-        "--models_dir",
-        type=str,
-        help="Path to the models directory.",
-    )
+    parser = launch_cli(description="ExLlamaV2 Benchmark.")
     args = parser.parse_args()
-    logging.info(
-        f"Running benchmark with: max_tokens={args.max_tokens} prompt={args.prompt} "
-        + f"repetitions={args.repetitions} device=cuda"
-    )
-    report = defaultdict(lambda: defaultdict(float))
-    for quantize in ("q8", "q4"):
-        logging.info(f"Running ExllamaV2 benchmark with {quantize}")
-        llamacpp_bench = ExllamaV2Benchmark(
-            f"{args.models_dir}/llama-2-7b-exllamav2-{quantize}"
-        ).load_model()
-        llamacpp_bench.benchmark(
-            max_tokens=args.max_tokens, prompt=args.prompt, repetitions=args.repetitions
-        )
-        q = "int8" if quantize == "q8" else "int4"
-        report["exllamav2"][q] = {
-            "mean": np.mean(llamacpp_bench.results),
-            "std": np.std(llamacpp_bench.results),
-        }
-
-    logging.info("Benchmark report")
-    with open(args.log_file, "a") as file:
-        for framework, quantizations in report.items():
-            for quantization, stats in quantizations.items():
-                logging.info(
-                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}"
-                )
-                print(
-                    f"{framework}, {quantization}: {stats['mean']:.2f} ± {stats['std']:.2f}",
-                    file=file,
-                )
+
+    model_folder = os.path.join(os.getcwd(), "models")
+    model_name = (
+        f"{args.model_name}-2-7b-chat-exllamav2-"
+        if args.model_name == "llama"
+        else f"{args.model_name}-7b-v0.1-instruct-exllamav2-"
+    )
+
+    runner_dict = {
+        "cuda": [
+            {
+                "precision": "int4",
+                "model_path": os.path.join(model_folder, model_name + "4.0-bit"),
+            },
+            {
+                "precision": "int8",
+                "model_path": os.path.join(model_folder, model_name + "8.0-bit"),
+            },
+        ]
+    }
+
+    make_report(
+        args=args,
+        benchmark_class=ExLlamaV2Benchmark,
+        runner_dict=runner_dict,
+        benchmark_name="ExLlamaV2",
+        is_bench_pytorch=False,
+    )
```
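To see how the pieces of the new class fit together, here is a minimal smoke-test driver that bypasses `launch_cli`/`make_report` (whose internals live in `common/` and are not part of this diff). The module name, model path, experiment name, and prompt are assumptions for illustration; only the class API comes from the diff above.

```python
# Hypothetical driver for the refactored benchmark class, not repository code.
from bench import ExLlamaV2Benchmark  # assumed module name for the file above

bench = ExLlamaV2Benchmark(
    model_path="models/llama-2-7b-chat-exllamav2-4.0-bit",  # assumed local path
    model_name="llama",
    benchmark_name="ExLlamaV2",
    precision="int4",
    device="cuda",
    experiment_name="smoke-test",  # illustrative
).load_model_and_tokenizer()

# The BaseBenchmarkClass contract visible in the diff:
# preprocess -> run_model -> postprocess, then on_exit for cleanup.
inputs = bench.preprocess(prompt="Explain quantization in one sentence.")
result = bench.run_model(inputs, max_tokens=128, temperature=0.1)
print(bench.postprocess(result))
print("output tokens:", result["num_output_tokens"])
bench.on_exit()
```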
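One behavioral note on the diff: the old `run_model` timed generation in place and returned a throughput figure, while the new one returns raw token counts and leaves timing to the shared harness, presumably inside `make_report`. A caller who still wants the old-style metric could wrap the new interface; the sketch below is an illustration built only on the methods shown above, not repository code.

```python
import time


def tokens_per_second(bench, prompt: str, max_tokens: int = 256) -> float:
    """Illustrative only: approximates the old script's metric (tokens
    generated divided by wall-clock generation time) on top of the new API."""
    inputs = bench.preprocess(prompt=prompt)
    start = time.time()
    result = bench.run_model(inputs, max_tokens=max_tokens, temperature=0.1)
    elapsed = time.time() - start
    return result["num_output_tokens"] / elapsed
```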