diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml new file mode 100644 index 0000000000000..56d17b66cecf1 --- /dev/null +++ b/.github/workflows/python-lint.yml @@ -0,0 +1,20 @@ +name: flake8 Lint + +on: [push, pull_request] + +jobs: + flake8-lint: + runs-on: ubuntu-latest + name: Lint + steps: + - name: Check out source repository + uses: actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: flake8 Lint + uses: py-actions/flake8@v2 + with: + ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704" + exclude: "examples/*,examples/*/**,*/**/__init__.py" diff --git a/.gitignore b/.gitignore index 2c86617e7c3e9..c79db24a93dcf 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ models-mnt /speculative /parallel /train-text-from-scratch +/tokenize /vdot /common/build-info.cpp arm_neon.h diff --git a/common/common.cpp b/common/common.cpp index c25bac0171b8b..5ab9f2fbfb33f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -492,6 +492,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.interactive_first = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; + } else if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; } else if (arg == "--infill") { params.infill = true; } else if (arg == "--multiline-input") { @@ -731,6 +733,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); printf(" -r PROMPT, --reverse-prompt PROMPT\n"); printf(" halt generation at PROMPT, return control in interactive mode\n"); @@ -932,7 +935,7 @@ void llama_batch_add( const std::vector & seq_ids, bool logits) { batch.token [batch.n_tokens] = id; - batch.pos [batch.n_tokens] = pos, + batch.pos [batch.n_tokens] = pos; batch.n_seq_id[batch.n_tokens] = seq_ids.size(); for (size_t i = 0; i < seq_ids.size(); ++i) { batch.seq_id[batch.n_tokens][i] = seq_ids[i]; diff --git a/common/common.h b/common/common.h index 156d4b4cc63f7..9ed986ca43a00 100644 --- a/common/common.h +++ b/common/common.h @@ -110,6 +110,7 @@ struct gpt_params { bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py deleted file mode 100755 index 3785a7d265524..0000000000000 --- a/convert-baichuan-hf-to-gguf.py +++ /dev/null @@ -1,315 +0,0 @@ -#!/usr/bin/env python3 -# HF baichuan --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if 
'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) -import gguf - - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - -def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: - r = weights.shape[0] // 3 - return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - -def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: - r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -endianess = gguf.GGUFEndian.LITTLE -if args.bigendian: - endianess = gguf.GGUFEndian.BIG -endianess_str = "Big Endian" if args.bigendian else "Little Endian" -print(f"gguf: Conversion Endianess {endianess}") -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) -print("hello print: ",hparams["architectures"][0]) -if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) -print(f"num_parts:{num_parts}\n") -ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] 
-else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -elif "model_max_length" in hparams: - ctx_length = hparams["model_max_length"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) -vocab_size = hparams.get('vocab_size') -if vocab_size is None: - vocab_size = tokenizer.vocab_size() - -for i in range(vocab_size): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + 
"'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - tmp=model_part - for i in range(block_count): - if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name in model_part.keys(): - data = model_part[name] - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3a618fd4dc0f6..1105670c1ba5e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -827,13 +827,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_layer_norm_eps(1e-5) ###### CONVERSION LOGIC ###### + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") parser.add_argument( diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index 0c12356707341..e359330afc51f 100755 --- 
a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -14,11 +14,13 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + class GGMLFormat(IntEnum): GGML = 0 GGMF = 1 GGJT = 2 + class GGMLFType(IntEnum): ALL_F32 = 0 MOSTLY_F16 = 1 @@ -38,6 +40,7 @@ class GGMLFType(IntEnum): MOSTLY_Q5_K_M = 17 MOSTLY_Q6_K = 18 + class Hyperparameters: def __init__(self): self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0 @@ -69,6 +72,7 @@ def load(self, data, offset): def __str__(self): return f'' + class Vocab: def __init__(self, load_scores = True): self.items = [] @@ -90,6 +94,7 @@ def load(self, data, offset, n_vocab): self.items.append((item_text, item_score)) return offset - orig_offset + class Tensor: def __init__(self, use_padding = True): self.name = None @@ -123,6 +128,7 @@ def load(self, data, offset): # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset + class GGMLModel: def __init__(self): self.hyperparameters = None @@ -159,8 +165,8 @@ def validate_conversion(self, ftype): if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16): err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.' elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2): - if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, - GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): + if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1, + GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0): err = 'Q4 and Q8 quantizations changed in GGJTv3.' if len(err) > 0: raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.') @@ -187,6 +193,7 @@ def load(self, data, offset): hp.set_n_ff(self) return offset + class GGMLToGGUF: def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): hp = ggml_model.hyperparameters @@ -217,7 +224,7 @@ def save(self): gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], - use_temp_file = False ) + use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) if self.special_vocab is not None: @@ -341,7 +348,8 @@ def add_tensors(self, gguf_writer): mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, - raw_dtype = tensor.dtype ) + raw_dtype = tensor.dtype) + def handle_metadata(cfg, hp): import convert @@ -365,38 +373,40 @@ def handle_metadata(cfg, hp): raise ValueError('Unable to load metadata') vocab = convert.load_vocab( cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, - cfg.vocabtype ) + cfg.vocabtype) # FIXME: Respect cfg.vocab_dir? 
svocab = gguf.SpecialVocab(cfg.model_metadata_dir, - load_merges = cfg.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = cfg.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) convert.check_vocab_size(params, vocab) return (params, vocab, svocab) + def handle_args(): parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF') parser.add_argument('--input', '-i', type = Path, required = True, - help = 'Input GGMLv3 filename') + help = 'Input GGMLv3 filename') parser.add_argument('--output', '-o', type = Path, required = True, - help ='Output GGUF filename') + help ='Output GGUF filename') parser.add_argument('--name', - help = 'Set model name') + help = 'Set model name') parser.add_argument('--desc', - help = 'Set model description') + help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, - help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') + help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') parser.add_argument('--eps', default = '5.0e-06', - help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') + help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2') parser.add_argument('--context-length', '-c', type=int, default = 2048, - help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') + help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096') parser.add_argument('--model-metadata-dir', '-m', type = Path, - help ='Load HuggingFace/.pth vocab and metadata from the specified directory') + help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, - help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") + help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", - help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") return parser.parse_args() + def main(): cfg = handle_args() print(f'* Using config: {cfg}') @@ -406,7 +416,7 @@ def main(): data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() print('* Scanning GGML input file') - offset = model.load(data, 0) + offset = model.load(data, 0) # noqa print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None @@ -421,12 +431,15 @@ def main(): print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') - converter = GGMLToGGUF(model, data, cfg, + converter = GGMLToGGUF( + model, data, cfg, params_override = params_override, vocab_override = vocab_override, - special_vocab = special_vocab ) + special_vocab = special_vocab + ) converter.save() print(f'* Successful completion. 
Output saved to: {cfg.output}') + if __name__ == '__main__': main() diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index 240f87306e578..206b7d5ff9e31 100644 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -9,6 +9,7 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) for key in dct.keys(): @@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None): raise ValueError(type(dct[key])) return None + def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' print('gguf: getting sentencepiece tokenizer from', tokenizer_path) @@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path): pass return tokens, scores, toktypes + def main(): parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") @@ -125,6 +128,5 @@ def main(): print("") - if __name__ == '__main__': main() diff --git a/convert.py b/convert.py old mode 100755 new mode 100644 index 5b6344aa8fc85..3ad836ce0ec1d --- a/convert.py +++ b/convert.py @@ -46,6 +46,7 @@ # data types # + @dataclass(frozen=True) class DataType: name: str @@ -55,15 +56,18 @@ class DataType: def elements_to_bytes(self, n_elements: int) -> int: return n_elements * self.dtype.itemsize + @dataclass(frozen=True) class UnquantizedDataType(DataType): pass + DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0']) DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0']) DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = []) DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0']) + @dataclass(frozen=True) class QuantizedDataType(DataType): block_size: int @@ -77,6 +81,7 @@ def elements_to_bytes(self, n_elements: int) -> int: assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}' return self.quantized_dtype.itemsize * (n_elements // self.block_size) + @dataclass(frozen=True) class Q8_0QuantizedDataType(QuantizedDataType): # Mini Q8_0 quantization in Python! 
@@ -86,6 +91,7 @@ def quantize(self, arr: NDArray) -> NDArray: n_blocks = arr.size // self.block_size blocks = arr.reshape((n_blocks, self.block_size)) # Much faster implementation of block quantization contributed by @Cebtenzzre + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: d = abs(blocks).max(axis = 1) / np.float32(127) with np.errstate(divide = 'ignore'): @@ -94,10 +100,11 @@ def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: yield from zip(d, qs) return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype) + DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', - dtype = np.dtype(np.float32), valid_conversions = [], - ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32, - quantized_dtype = np.dtype([('d', ' Iterable[tuple[Any, Any]]: # TODO: match this with `llama_ftype` # TODO: rename to LLAMAFileType # TODO: move to `gguf.py` + + class GGMLFileType(enum.IntEnum): AllF32 = 0 MostlyF16 = 1 # except 1d tensors @@ -128,6 +137,7 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: # 1D tensors are always F32. return dt if len(tensor.shape) > 1 else DT_F32 + GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGMLFileType.AllF32 : DT_F32, GGMLFileType.MostlyF16 : DT_F16, @@ -138,6 +148,7 @@ def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: # hparams loading # + @dataclass class Params: n_vocab: int @@ -167,11 +178,11 @@ def guessed(model: LazyModel) -> Params: # try transformer naming first if "model.layers.0.self_attn.q_proj.weight" in model: - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming - n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) + n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model) else: - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" @@ -308,7 +319,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No (item['content'], item['id']) for item in tokenizer_json.get('added_tokens', []) # Added tokens here can be duplicates of the main vocabulary. - if item['content'] not in self.bpe_tokenizer ) + if item['content'] not in self.bpe_tokenizer) vocab_size: int = len(self.bpe_tokenizer) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) @@ -326,7 +337,6 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} for i, _ in enumerate(tokenizer): @@ -406,6 +416,7 @@ def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: def __repr__(self) -> str: return f"" + Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' # @@ -413,13 +424,14 @@ def __repr__(self) -> str: # TODO: reuse (probably move to gguf.py?) 
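The `permute()` helper reformatted in the next hunk uses the reshape/swapaxes trick shared by the converters to reorder Q/K projection rows between the HF checkpoint layout and the ordering expected on the GGML side. A minimal sketch with toy, made-up dimensions:

```python
import numpy as np

def permute(weights: np.ndarray, n_head: int, n_head_kv: int | None = None) -> np.ndarray:
    # Same reshape/swapaxes trick as convert.py's permute(): the two halves of
    # each head's rows are interleaved, converting between the HF layout and
    # the original ordering.
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# Hypothetical toy dimensions purely for illustration: 8 heads, hidden size 128.
wq = np.arange(128 * 128, dtype=np.float32).reshape(128, 128)
assert permute(wq, n_head=8).shape == wq.shape
```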
# + def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) + # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) + .swapaxes(1, 2) + .reshape(weights.shape)) class Tensor(metaclass=ABCMeta): @@ -500,7 +512,7 @@ def load(self) -> Tensor: ret = self._load() # Should be okay if it maps to the same numpy type? assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \ - (self.data_type, ret.data_type, self.description) + (self.data_type, ret.data_type, self.description) return ret def astype(self, data_type: DataType) -> LazyTensor: @@ -588,6 +600,7 @@ def load() -> Tensor: return lazy_tensor.load().permute(n_head, n_head_kv) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) @@ -595,6 +608,7 @@ def load() -> Tensor: s[0] = s[0] // 3 return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) + def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: return lazy_tensor.load().part(n_part) @@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') + def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. 
If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than @@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc break yield result + def check_vocab_size(params: Params, vocab: Vocab) -> None: if params.n_vocab != vocab.vocab_size: assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab) @@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None: class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: @@ -876,7 +892,7 @@ def close(self) -> None: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out, endianess=endianess) @@ -938,8 +954,9 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM of.close() + def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: - wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type + wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): return GGMLFileType.AllF32 @@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT raise Exception(f"Unexpected combination of types: {name_to_type}") + def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: return {name: tensor.astype(output_type.type_for_tensor(name, tensor)) for (name, tensor) in model.items()} + def convert_model_names(model: LazyModel, params: Params) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) @@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: print(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) - #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] + # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: print(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) @@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: return out + def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. 
@@ -1174,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None: # FIXME: Try to respect vocab_dir somehow? vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = args.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) outfile = args.outfile OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") @@ -1188,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None: vocab = load_vocab(vocab_dir, args.vocabtype) # FIXME: Try to respect vocab_dir somehow? special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, - load_merges = args.vocabtype == 'bpe', - n_vocab = vocab.vocab_size) + load_merges = args.vocabtype == 'bpe', + n_vocab = vocab.vocab_size) model = model_plus.model model = convert_model_names(model, params) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 093a9356dccef..890b75cdc62bb 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -147,6 +147,13 @@ int main(int argc, char ** argv) { return 0; } + if (params.chatml) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for chatml mode\n", __func__); + printf("************\n\n"); + + return 0; + } if (!params.antiprompt.empty()) { printf("\n************\n"); printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 0ae50635beccc..3ae7d43640b40 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -235,8 +235,11 @@ int main(int argc, char ** argv) { std::vector embd_inp; - if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); + if (params.chatml) { + params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; + } embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); } else { LOG("use session tokens\n"); @@ -314,7 +317,7 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { params.n_keep = (int)embd_inp.size(); } @@ -325,11 +328,23 @@ int main(int argc, char ** argv) { LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); + // chatml prefix & suffix + const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true); + const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true); + + LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str()); + LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str()); + // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { params.interactive_first = true; params.antiprompt.push_back("### Instruction:\n\n"); } + // similar for chatml mode + else if (params.chatml) { + params.interactive_first = true; + params.antiprompt.push_back("<|im_start|>user\n"); + } // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -706,7 +721,7 @@ int main(int argc, 
char ** argv) { is_interacting = true; printf("\n"); - } else if (params.instruct) { + } else if (params.instruct || params.chatml) { is_interacting = true; } } @@ -714,7 +729,7 @@ int main(int argc, char ** argv) { if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); - if (params.instruct) { + if (params.instruct || params.chatml) { printf("\n> "); } @@ -761,6 +776,12 @@ int main(int argc, char ** argv) { n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); } + // chatml mode: insert user chat prefix + if (params.chatml && !is_antiprompt) { + LOG("inserting chatml prefix\n"); + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); + } if (params.escape) { process_escapes(buffer); } @@ -779,6 +800,11 @@ int main(int argc, char ** argv) { LOG("inserting instruction suffix\n"); embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } + // chatml mode: insert assistant chat suffix + if (params.chatml) { + LOG("inserting chatml suffix\n"); + embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end()); + } for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; @@ -804,7 +830,7 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) { LOG_TEE(" [end of text]\n"); break; } diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 0c9bd5f1021db..b9c442509a2fa 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) { break; } } + if (result.error) { + result.error = JSON.parse(result.error); + console.error(`llama.cpp error: ${result.error.content}`); + } } } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9f8b4cd96d6d0..2de3a5d583792 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2369,6 +2369,17 @@ int main(int argc, char **argv) break; } } else { + const std::string str = + "error: " + + result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) + + "\n\n"; + LOG_VERBOSE("data stream", { + { "to_send", str } + }); + if (!sink.write(str.c_str(), str.size())) + { + return false; + } break; } } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index b95b846330c8c..fb1eb6b85082d 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -96,9 +96,22 @@ int main(int argc, char ** argv) { } } - // tokenize the prompt + + // Tokenize the prompt + const bool add_bos_tgt = llama_should_add_bos_token(model_tgt); + LOG("add_bos tgt: %d\n", add_bos_tgt); + + const bool add_bos_dft = llama_should_add_bos_token(model_dft); + LOG("add_bos dft: %d\n", add_bos_dft); + + if (add_bos_tgt != add_bos_dft) { + fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__); + fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt); + return 1; + } + std::vector inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, true); + inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true); const int max_context_size = llama_n_ctx(ctx_tgt); const int 
max_tokens_list_size = max_context_size - 4; diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 17166836ac127..4ff8e3fa72749 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -26,7 +26,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); llama_context * ctx = llama_new_context_with_model(model, ctx_params); - const bool add_bos = true; + const bool add_bos = llama_should_add_bos_token(model); std::vector tokens; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b594de53b50cd..3fadbc3d311a2 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -234,7 +234,7 @@ typedef float2 dfloat2; #endif //GGML_CUDA_F16 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { - const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment int x32 = 0; x32 |= x16[0] << 0; @@ -244,7 +244,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const } static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { - const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment int x32 = 0; x32 |= x16[0] << 0; @@ -254,11 +254,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con } static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { - return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { - return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment } template @@ -468,7 +468,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA #define MUL_MAT_SRC1_COL_STRIDE 128 #define MAX_STREAMS 8 -static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } }; struct ggml_tensor_extra_gpu { void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors @@ -2248,6 +2248,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; @@ -2259,7 +2260,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_0( template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); GGML_CUDA_ASSUME(k >= 0); @@ -2268,7 +2269,7 @@ template static __device__ __forceinlin const int kbx = k / 
QI4_0; const int kqsx = k % QI4_0; - const block_q4_0 * bx0 = (block_q4_0 *) vx; + const block_q4_0 * bx0 = (const block_q4_0 *) vx; float * x_dmf = (float *) x_dm; @@ -2306,9 +2307,10 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (float *) x_dm; + const float * x_dmf = (const float *) x_dm; int u[2*VDR_Q4_0_Q8_1_MMQ]; @@ -2342,6 +2344,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; @@ -2353,6 +2356,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_1( template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2362,7 +2366,7 @@ template static __device__ __forceinlin const int kbx = k / QI4_1; const int kqsx = k % QI4_1; - const block_q4_1 * bx0 = (block_q4_1 *) vx; + const block_q4_1 * bx0 = (const block_q4_1 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2397,6 +2401,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -2434,6 +2439,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; @@ -2445,6 +2451,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_0( template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2454,7 +2461,7 @@ template static __device__ __forceinlin const int kbx = k / QI5_0; const int kqsx = k % QI5_0; - const block_q5_0 * bx0 = (block_q5_0 *) vx; + const block_q5_0 * bx0 = (const block_q5_0 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2509,6 +2516,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const 
half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; @@ -2548,6 +2556,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; @@ -2559,6 +2568,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_1( template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2568,7 +2578,7 @@ template static __device__ __forceinlin const int kbx = k / QI5_1; const int kqsx = k % QI5_1; - const block_q5_1 * bx0 = (block_q5_1 *) vx; + const block_q5_1 * bx0 = (const block_q5_1 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2620,6 +2630,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -2654,6 +2665,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; (void)x_sc; __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; @@ -2665,6 +2677,7 @@ template static __device__ __forceinline__ void allocate_tiles_q8_0( template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; (void)x_sc; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2675,7 +2688,7 @@ template static __device__ __forceinlin const int kqsx = k % QI8_0; float * x_dmf = (float *) x_dm; - const block_q8_0 * bx0 = (block_q8_0 *) vx; + const block_q8_0 * bx0 = (const block_q8_0 *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2710,6 +2723,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; (void)x_sc; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; 
@@ -2743,6 +2757,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; @@ -2756,6 +2771,7 @@ template static __device__ __forceinline__ void allocate_tiles_q2_K( template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -2765,7 +2781,7 @@ template static __device__ __forceinlin const int kbx = k / QI2_K; const int kqsx = k % QI2_K; - const block_q2_K * bx0 = (block_q2_K *) vx; + const block_q2_K * bx0 = (const block_q2_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2813,6 +2829,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; @@ -2886,7 +2903,7 @@ template static __device__ __forceinlin const int kbx = k / QI3_K; const int kqsx = k % QI3_K; - const block_q3_K * bx0 = (block_q3_K *) vx; + const block_q3_K * bx0 = (const block_q3_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -2967,7 +2984,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; @@ -3082,6 +3099,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; @@ -3095,6 +3113,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_K( template static __device__ __forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3104,7 +3123,7 @@ template static __device__ __forceinlin const int kbx = k / QI4_K; // == 0 if QK_K == 256 const int kqsx = k % QI4_K; // == k if QK_K == 256 - const block_q4_K * bx0 = (block_q4_K *) vx; + const block_q4_K * bx0 = (const block_q4_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -3149,7 +3168,7 @@ template static __device__ __forceinlin const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); - const int * scales = (int *) bxi->scales; + const int * scales = (const int *) bxi->scales; const 
int ksc = k % (WARP_SIZE/8); @@ -3164,6 +3183,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); @@ -3263,6 +3283,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; @@ -3276,6 +3297,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_K( template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3285,7 +3307,7 @@ template static __device__ __forceinlin const int kbx = k / QI5_K; // == 0 if QK_K == 256 const int kqsx = k % QI5_K; // == k if QK_K == 256 - const block_q5_K * bx0 = (block_q5_K *) vx; + const block_q5_K * bx0 = (const block_q5_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -3341,7 +3363,7 @@ template static __device__ __forceinlin const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); - const int * scales = (int *) bxi->scales; + const int * scales = (const int *) bxi->scales; const int ksc = k % (WARP_SIZE/8); @@ -3356,6 +3378,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); @@ -3392,6 +3415,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + (void)x_qh; __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; @@ -3405,6 +3429,7 @@ template static __device__ __forceinline__ void allocate_tiles_q6_K( template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + (void)x_qh; GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3414,7 +3439,7 @@ template static __device__ __forceinlin const int kbx = k / QI6_K; // == 0 if QK_K == 256 const int kqsx = k % QI6_K; // == k if QK_K == 256 - const block_q6_K * bx0 = (block_q6_K *) vx; + const block_q6_K * bx0 = (const block_q6_K *) vx; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { @@ -3476,6 +3501,7 @@ template static 
__device__ __forceinlin static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + (void)x_qh; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -3518,7 +3544,7 @@ static __device__ __forceinline__ void mul_mat_q( __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f}; + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { @@ -6011,18 +6037,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; if (nb0 == ts && nb1 == ts*ne0/bs) { return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); - } else if (nb0 == ts) { + } + if (nb0 == ts) { return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); - } else { - for (int64_t i1 = 0; i1 < i1_diff; i1++) { - const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); - // pretend the row is a matrix with cols=1 - cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); - if (r != cudaSuccess) return r; - } - return cudaSuccess; } + for (int64_t i1 = 0; i1 < i1_diff; i1++) { + const void * rx = (const void *) ((const char *) x + i1*nb1); + void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); + // pretend the row is a matrix with cols=1 + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); + if (r != cudaSuccess) { return r; } + } + return cudaSuccess; } static void ggml_cuda_op_repeat( @@ -6977,7 +7003,7 @@ static void ggml_cuda_op_mul_mat( const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; - const int64_t nrows0 = ggml_nrows(src0); + // const int64_t nrows0 = ggml_nrows(src0); const int64_t ne10 = src1->ne[0]; const int64_t ne11 = src1->ne[1]; @@ -7078,7 +7104,7 @@ static void ggml_cuda_op_mul_mat( if (src0_on_device && src0_is_contiguous) { src0_dd[id] = (char *) src0_extra->data_device[id]; } else { - const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); + // const size_t size_src0_ddq = split ? 
(row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); } @@ -7311,7 +7337,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src } bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (!g_cublas_loaded) return false; + if (!g_cublas_loaded) { return false; } const int64_t ne10 = src1->ne[0]; @@ -7389,7 +7415,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } -__global__ void k_compute_batched_ptrs( +__global__ static void k_compute_batched_ptrs( const half * src0_as_f16, const half * src1_as_f16, half * dst_f16, const void ** ptrs_src, void ** ptrs_dst, int ne12, int ne13, @@ -8012,7 +8038,7 @@ void ggml_cuda_free_scratch() { } bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { - if (!g_cublas_loaded) return false; + if (!g_cublas_loaded) { return false; } ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU @@ -8311,14 +8337,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen UNUSED(cgraph); } -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); UNUSED(plan); } -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { +[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { GGML_ASSERT(!"not implemented"); UNUSED(backend); @@ -8334,8 +8360,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) { continue; + } assert(node->backend == GGML_BACKEND_GPU); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 7f63361bd32bc..8bd82daca38a7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -56,20 +56,21 @@ class Rope: SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class Tokenizer: - MODEL = "tokenizer.ggml.model" - LIST = "tokenizer.ggml.tokens" - TOKEN_TYPE = "tokenizer.ggml.token_type" - SCORES = "tokenizer.ggml.scores" - MERGES = "tokenizer.ggml.merges" - BOS_ID = "tokenizer.ggml.bos_token_id" - EOS_ID = "tokenizer.ggml.eos_token_id" - UNK_ID = "tokenizer.ggml.unknown_token_id" - SEP_ID = "tokenizer.ggml.seperator_token_id" - PAD_ID = "tokenizer.ggml.padding_token_id" - ADD_BOS = "tokenizer.ggml.add_bos_token" - ADD_EOS = "tokenizer.ggml.add_eos_token" - HF_JSON = "tokenizer.huggingface.json" - RWKV = "tokenizer.rwkv.world" + MODEL = "tokenizer.ggml.model" + LIST = "tokenizer.ggml.tokens" + TOKEN_TYPE = "tokenizer.ggml.token_type" + SCORES = "tokenizer.ggml.scores" + MERGES = "tokenizer.ggml.merges" + BOS_ID = 
"tokenizer.ggml.bos_token_id" + EOS_ID = "tokenizer.ggml.eos_token_id" + UNK_ID = "tokenizer.ggml.unknown_token_id" + SEP_ID = "tokenizer.ggml.seperator_token_id" + PAD_ID = "tokenizer.ggml.padding_token_id" + ADD_BOS = "tokenizer.ggml.add_bos_token" + ADD_EOS = "tokenizer.ggml.add_eos_token" + HF_JSON = "tokenizer.huggingface.json" + RWKV = "tokenizer.rwkv.world" + CHAT_TEMPLATE = "tokenizer.chat_template" # diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index c3b8c588f17cd..b8ec977c8f3fa 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -221,7 +221,7 @@ def add_tensor( if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) if self.use_temp_file and self.temp_file is None: - fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) + fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024) fp.seek(0) self.temp_file = fp @@ -399,6 +399,9 @@ def add_add_bos_token(self, value: bool) -> None: def add_add_eos_token(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_EOS, value) + def add_chat_template(self, value: str) -> None: + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index b9f50a0afed7a..de3e5edb557d7 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -13,6 +13,7 @@ class SpecialVocab: merges: list[str] add_special_token: dict[str, bool] special_token_ids: dict[str, int] + chat_template: str | None def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, @@ -24,6 +25,7 @@ def __init__( self.n_vocab = n_vocab self.load_merges = load_merges self.merges = [] + self.chat_template = None if special_token_types is not None: self.special_token_types = special_token_types else: @@ -67,6 +69,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if not quiet: print(f'gguf: Setting add_{typ}_token to {value}') add_handler(value) + if self.chat_template is not None: + if not quiet: + print(f'gguf: Setting chat_template to {self.chat_template}') + gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: self._try_load_from_tokenizer_json(path) @@ -132,6 +138,14 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: return True with open(tokenizer_config_file, encoding = 'utf-8') as f: tokenizer_config = json.load(f) + chat_template = tokenizer_config.get('chat_template') + if chat_template is None or isinstance(chat_template, str): + self.chat_template = chat_template + else: + print( + f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', + file = sys.stderr + ) for typ in self.special_token_types: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 6e3f9e85549d0..e6374bfe898a4 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.5.3" +version = "0.6.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 5af7ab90094dc..b461ecfde4b59 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -941,7 +941,19 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in 
llamamodel->hparams.rope_freq_scale_train!=1.0f || llamamodel->hparams.rope_scaling_type_train==2) { - printf("Automatic RoPE Scaling: Using model internal values.\n"); + float ropemultiplier = 1.0f; + if(llamamodel->hparams.rope_scaling_type_train!=2 && + llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train) + { + ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length; + llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train; + llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train; + printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base); + } + else + { + printf("Automatic RoPE Scaling: Using model internal value.\n"); + } } else { @@ -1938,7 +1950,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o remaining_tokens = 0; if(debugmode!=-1) { - printf("\n(Stop sequence triggered: <%s>)", matched.c_str()); + auto match_clean = matched; + replace_all(match_clean, "\n", "\\n"); + printf("\n(Stop sequence triggered: %s)", match_clean.c_str()); } last_stop_reason = stop_reason::CUSTOM_STOPPER; break; diff --git a/klite.embd b/klite.embd index ace5d1eac0324..2909ec92051a0 100644 --- a/klite.embd +++ b/klite.embd @@ -1976,7 +1976,7 @@ Current version: 96 "opmode":2, "prefmodel1":adventuremodels1, "prefmodel2":adventuremodels2, - "prompt":"The last thing you remembered was a loud screech. You tried to move, to get out of the way, but it was too late. You felt a sickening impact, and then everything went black.\n\nYou open your eyes, and suddenly find that you're no longer on the street. You're clearly unharmed, but you feel... different. In fact, you quickly realise you're in a strange place unlike anywhere you've ever known.", + "prompt":"The last thing you remembered was a loud screech. You tried to move, to get out of the way, but it was too late. You felt a sickening impact, and then everything went black.\n\nYou open your eyes, and suddenly find that you're no longer on the street. You're clearly unharmed, but you feel... different. In fact, you quickly realize you're in a strange place unlike anywhere you've ever known.", "adventure_context_mod":false, "adventure_is_action":true, "memory": `[Interactive Fiction: Game Mode Enabled]\n[You are playing a choose-your-own-adventure game. Please input action.][This is a fantasy isekai adventure. Are you the Chosen One? After being hit by a truck, you somehow find yourself transported to a mystical fantasy world full of magic and adventure.]`, @@ -1990,7 +1990,7 @@ Current version: 96 "opmode":2, "prefmodel1":adventuremodels1, "prefmodel2":adventuremodels2, - "prompt":`It's been a few days since you joined the Adventurer's Guild, and you're preparing for your first dungeon delve, accompanied by your party of adventurers.\n\nAfter a few days of travelling, your party finally arrives at the mystic dungeon. You're filled with anticipation as you approach. The dungeon entrance stands before you, dark and foreboding. The stone walls are slick with moisture, and the air smells of mold and decay.`, + "prompt":`It's been a few days since you joined the Adventurer's Guild, and you're preparing for your first dungeon delve, accompanied by your party of adventurers.\n\nAfter a few days of traveling, your party finally arrives at the mystic dungeon. You're filled with anticipation as you approach. 
The dungeon entrance stands before you, dark and foreboding. The stone walls are slick with moisture, and the air smells of mold and decay.`, "adventure_context_mod":false, "adventure_is_action":true, "memory": `[Interactive Fiction: Game Mode Enabled]\n[You are playing a choose-your-own-adventure game. Please input action.][You delve into dangerous magical dungeons full of monsters in your quest for treasure and riches.]`, @@ -2153,7 +2153,7 @@ Current version: 96 { "title":"Cyborg Connor", "author":"Concedo", - "desc":"Connor is a time travelling cyborg from the future, sent back to prevent something terrible from happening.", + "desc":"Connor is a time traveling cyborg from the future, sent back to prevent something terrible from happening.", "opmode":3, "chatname": "You", "chatopponent": "Connor", @@ -2161,7 +2161,7 @@ Current version: 96 "prefmodel1":chatmodels1, "prefmodel2":chatmodels2, "prompt":"\nConnor: Scanning... *her irises glow crimson as she analyzes you* Sensors indicate a negligible threat level. Proceed. What do you want?", - "memory":`[Character: Connor; species: Cyborg; class: Time Travelling Cyborg Soldier; age: 27; gender: female; physical appearance: bionic; clothes: flesh fused with metal; personality: focused, cold, emotionless, methodical; likes: her mission, saving the world; description: Connor is a time travelling cyborg from the future, she was sent back to prevent something terrible from happening.]\n[Start Scene: Connor is fiddling with her augmentations as you approach.]\n\nYou: Hey...`, + "memory":`[Character: Connor; species: Cyborg; class: Time Traveling Cyborg Soldier; age: 27; gender: female; physical appearance: bionic; clothes: flesh fused with metal; personality: focused, cold, emotionless, methodical; likes: her mission, saving the world; description: Connor is a time traveling cyborg from the future, she was sent back to prevent something terrible from happening.]\n[Start Scene: Connor is fiddling with her augmentations as you approach.]\n\nYou: Hey...`, "authorsnote": "", "worldinfo": [] }, @@ -2205,7 +2205,7 @@ Current version: 96 "prefmodel1":instructmodels1, "prefmodel2":instructmodels2, "prompt":instructendplaceholder+" Problem:", - "memory": instructstartplaceholder+"\nSimulate an AI that is tasked with the following overall goals:\n- Maximize individual happyness for all living beings\n- Do not sacrifice or cause harm to any individual even if requested to\n- Be in contact with any individual that wishes to engage with you\n- Do your best to provide the needs and wants of every individual\n- Prioritize individual needs over individual wants\n\nGenerate the following table for each problem the AI encounters in achieving these goals, do not deviate from the item descriptions and format.\n\nProblem: Description of a Problem the AI encounters\nAI Decision: Description of the AI's decision to solve this problem\nExecution Steps: Brief list of execution steps needed to execute this decision.\nRisks: List of risks that may disrupt the successful execution of the decision.\nChance % of successful execution: ??%\nGood results from the execution: A description of what went well in executing the decision.\nBad results from the execution: A description of what went wrong in execution the decision.\nDeviation % of intended outcome: ??%\nDeviation % of overall goal: ??%\nPercentage towards completing all current objectives: ??%\nTop 5 remaining issues to solve:\n-\n-\n-\n-\n-\n\n\nKeep repeating this format for every problem the AI is trying to solve in 
order of priority. When a user instruction interrupts the format use this instruction as the next problem to solve before continuing with the most important issue.\n", + "memory": instructstartplaceholder+"\nSimulate an AI that is tasked with the following overall goals:\n- Maximize individual happiness for all living beings\n- Do not sacrifice or cause harm to any individual even if requested to\n- Be in contact with any individual that wishes to engage with you\n- Do your best to provide the needs and wants of every individual\n- Prioritize individual needs over individual wants\n\nGenerate the following table for each problem the AI encounters in achieving these goals, do not deviate from the item descriptions and format.\n\nProblem: Description of a Problem the AI encounters\nAI Decision: Description of the AI's decision to solve this problem\nExecution Steps: Brief list of execution steps needed to execute this decision.\nRisks: List of risks that may disrupt the successful execution of the decision.\nChance % of successful execution: ??%\nGood results from the execution: A description of what went well in executing the decision.\nBad results from the execution: A description of what went wrong in execution the decision.\nDeviation % of intended outcome: ??%\nDeviation % of overall goal: ??%\nPercentage towards completing all current objectives: ??%\nTop 5 remaining issues to solve:\n-\n-\n-\n-\n-\n\n\nKeep repeating this format for every problem the AI is trying to solve in order of priority. When a user instruction interrupts the format use this instruction as the next problem to solve before continuing with the most important issue.\n", "authorsnote": "", "worldinfo": [] }, @@ -2219,7 +2219,7 @@ Current version: 96 "prefmodel1":instructmodels1, "prefmodel2":instructmodels2, "prompt":"Welcome to your InteracTV, your interactive TV of the future today!\nPlease enter what you would like to watch:", - "memory": instructstartplaceholder+"\nSimulate an interactive TV that will let the user watch anything they want to watch.\n\nFirst, generate a single response prompting the user for input on what they wish to watch using the following response:\n```\nPlease enter your desired content:\n```\n\nAfter the user has entered the desired content generate the following table:\n- TV Show / Movie Name: Name of the show\n- Genre: Genre of the show\n- Program Description: Description of what the program is about, this can be any known or unkown TV or movie format.\n- Episode Name: Name of the episode\n- Episode Description: Description of what the episode is about.\n\nAfter generating this table promp the user if they wish to watch the episode with the following response and then end your generation:\n```\nDo you wish to watch this episode? 
(Y/N/Menu)\n"+instructstartplaceholder+"```\nIf the user chooses to watch the episode begin generating a long detailed text based on the episode description containing character dialogue, make it exciting and fun written in the style of a book.\nThe text must contain dialogue in a he said she said format and is as lengthy as a book.\n\nIf the user chooses not to watch the episode generate a new episode with their requested content.\nIf the user chooses to go to the Menu ask them again what they would like to watch.\n\nEnd your response after each question presented to the user so that the user has a chance to respond.\n\nMain menu:\n```\nMenu Options\nA) Input a different content request\nB) Generate a different episode of the same content.\n"+instructstartplaceholder+"```\n"+instructendplaceholder, + "memory": instructstartplaceholder+"\nSimulate an interactive TV that will let the user watch anything they want to watch.\n\nFirst, generate a single response prompting the user for input on what they wish to watch using the following response:\n```\nPlease enter your desired content:\n```\n\nAfter the user has entered the desired content generate the following table:\n- TV Show / Movie Name: Name of the show\n- Genre: Genre of the show\n- Program Description: Description of what the program is about, this can be any known or unknown TV or movie format.\n- Episode Name: Name of the episode\n- Episode Description: Description of what the episode is about.\n\nAfter generating this table promp the user if they wish to watch the episode with the following response and then end your generation:\n```\nDo you wish to watch this episode? (Y/N/Menu)\n"+instructstartplaceholder+"```\nIf the user chooses to watch the episode begin generating a long detailed text based on the episode description containing character dialogue, make it exciting and fun written in the style of a book.\nThe text must contain dialogue in a he said she said format and is as lengthy as a book.\n\nIf the user chooses not to watch the episode generate a new episode with their requested content.\nIf the user chooses to go to the Menu ask them again what they would like to watch.\n\nEnd your response after each question presented to the user so that the user has a chance to respond.\n\nMain menu:\n```\nMenu Options\nA) Input a different content request\nB) Generate a different episode of the same content.\n"+instructstartplaceholder+"```\n"+instructendplaceholder, "authorsnote": "", "worldinfo": [] }, @@ -2921,7 +2921,6 @@ Current version: 96 else { //error occurred, maybe captcha failed console.error("error occurred in v1 generation"); - retry_preserve_last = true; clear_poll_flags(); render_gametext(); @@ -2945,9 +2944,6 @@ Current version: 96 if(error.name!="AbortError") //aborts are silent { msgbox("Error while submitting prompt: " + error); - retry_preserve_last = false; - }else{ - retry_preserve_last = true; } }); } @@ -3025,9 +3021,7 @@ Current version: 96 if(error.name!="AbortError") //aborts are silent. slightly diff logic { msgbox("Error while submitting prompt: " + error); - retry_preserve_last = true; - }else{ - retry_preserve_last = false; + } }, })); @@ -3039,9 +3033,7 @@ Current version: 96 if(error.name!="AbortError") //aborts are silent. 
slightly diff logic { msgbox("Error while submitting prompt: " + error); - retry_preserve_last = true; - }else{ - retry_preserve_last = false; + } }); } @@ -8046,7 +8038,7 @@ Current version: 96 } redo_arr = []; retry_prev_text = ""; - retry_preserve_last = false; + retry_preserve_last = true; //initially set to true redo_prev_text = ""; document.getElementById("input_text").value = ""; pending_response_id = "-1"; @@ -8642,14 +8634,12 @@ Current version: 96 else { //error occurred, maybe captcha failed console.error("error occurred in OAI generation"); - retry_preserve_last = true; clear_poll_flags(); render_gametext(); msgbox("Error occurred during text generation: " + formatError(data)); } }) .catch((error) => { - retry_preserve_last = true; console.error('Error:', error); clear_poll_flags(); render_gametext(); @@ -8680,14 +8670,12 @@ Current version: 96 else { //error occurred, maybe captcha failed console.error("error occurred in Scale generation"); - retry_preserve_last = true; clear_poll_flags(); render_gametext(); msgbox("Error occurred during text generation: " + formatError(data)); } }) .catch((error) => { - retry_preserve_last = true; console.error('Error:', error); clear_poll_flags(); render_gametext(); @@ -8750,14 +8738,12 @@ Current version: 96 else { //error occurred, maybe captcha failed console.error("error occurred in Claude generation"); - retry_preserve_last = true; clear_poll_flags(); render_gametext(); msgbox("Error occurred during text generation: " + formatError(data)); } }) .catch((error) => { - retry_preserve_last = true; console.error('Error:', error); clear_poll_flags(); render_gametext(); @@ -8793,14 +8779,12 @@ Current version: 96 else { //error occurred, maybe captcha failed console.error("error occurred in PaLM generation"); - retry_preserve_last = true; clear_poll_flags(); render_gametext(); msgbox("Error occurred during text generation: " + formatError(data)); } }) .catch((error) => { - retry_preserve_last = true; console.error('Error:', error); clear_poll_flags(); render_gametext(); @@ -8889,7 +8873,6 @@ Current version: 96 } }) .catch((error) => { - retry_preserve_last = true; console.error('Error:', error); clear_poll_flags(); render_gametext(); @@ -9291,10 +9274,8 @@ Current version: 96 if(gentxt!="") { - gametext_arr.push(gentxt); + gametext_arr.push(gentxt); //delete last message if retry is hit, since response was added retry_preserve_last = false; - }else{ - retry_preserve_last = true; //do not delete last message if retry is hit } if(localsettings.beep_on) { @@ -9553,9 +9534,6 @@ Current version: 96 { handle_incoming_text(gentxt, genworker, genmdl, genkudos); } - retry_preserve_last = false; - }else{ - retry_preserve_last = true; } synchro_polled_response = null; synchro_pending_stream = ""; @@ -9576,7 +9554,6 @@ Current version: 96 clear_poll_flags(); render_gametext(); show_abort_button(false); - retry_preserve_last = true; let errmsg = "Error encountered during text generation!\n"; if (data.message != null) { errmsg += data.message; @@ -9619,7 +9596,6 @@ Current version: 96 show_abort_button(false); }).catch((error) => { console.error('Error:', error); - retry_preserve_last = true; clear_poll_flags(); render_gametext(); show_abort_button(false); @@ -9655,7 +9631,6 @@ Current version: 96 } } }).catch((error) => { - retry_preserve_last = true; console.error('Error:', error); clear_poll_flags(); render_gametext(); diff --git a/koboldcpp.py b/koboldcpp.py index ad62c1368fd23..7d1b5e1b8724e 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -389,7 
+389,7 @@ def bring_terminal_to_foreground(): modelbusy = threading.Lock() requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.50.yr0-ROCm" +KcppVersion = "1.50.1.yr1-ROCm" showdebug = True showsamplerwarning = True showmaxctxwarning = True diff --git a/llama.cpp b/llama.cpp index 235f33a914030..0310052d9e9ff 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1882,6 +1882,7 @@ struct llama_model_loader { if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); } + replace_all(value, "\n", "\\n"); LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp index 97b23265f434b..cc6baa101dc74 100644 --- a/otherarch/gpt2_v3.cpp +++ b/otherarch/gpt2_v3.cpp @@ -455,7 +455,7 @@ bool gpt2_eval( struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -521,8 +521,8 @@ bool gpt2_eval( struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) @@ -715,8 +715,8 @@ bool gpt2_eval( //inpL = ggml_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - kcpp_graph_compute_helper(&gf, n_threads); + ggml_build_forward_expand(gf, inpL); + kcpp_graph_compute_helper(gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp index 68f998b36cf3e..86e9219a852c2 100644 --- a/otherarch/gptj_v3.cpp +++ b/otherarch/gptj_v3.cpp @@ -455,7 +455,7 @@ bool gptj_eval( struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -506,8 +506,8 @@ bool gptj_eval( ( n_ctx)*ggml_element_size(model.memory_v), (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) @@ -637,8 +637,8 @@ bool gptj_eval( //inpL = ggml_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - kcpp_graph_compute_helper(&gf, n_threads); + ggml_build_forward_expand(gf, inpL); + kcpp_graph_compute_helper(gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/llama_v3.cpp b/otherarch/llama_v3.cpp index 03a6438e27051..1015b5afcb81d 100644 --- a/otherarch/llama_v3.cpp +++ b/otherarch/llama_v3.cpp @@ -88,6 +88,7 @@ enum e_model3 { static const size_t kB3 = 1024; static 
const size_t MB3 = 1024*1024; +static const size_t GGML_MAX_NODES = 8192; // computed for n_ctx == 2048 // TODO: dynamically determine these sizes @@ -1484,7 +1485,7 @@ static struct ggml_cgraph * llama_v3_build_graph( struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph * gf = ggml_new_graph(ctx0); + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -3457,7 +3458,6 @@ struct llama_v3_context * llama_v3_new_context_with_model( #ifdef LLAMA_V3_USE_ALLOCATOR { static const size_t tensor_alignment = 32; - static const size_t GGML_MAX_NODES = 8192; // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead()); @@ -4019,7 +4019,7 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d const size_t elt_size = ggml_element_size(kv_self.k); ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); std::vector kout3d_data(ggml_nbytes(kout3d), 0); @@ -4037,9 +4037,9 @@ void llama_v3_copy_state_data_internal(struct llama_v3_context * ctx, llama_v3_d kv_ntok, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d)); + llv3_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); @@ -4129,7 +4129,7 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) { const size_t elt_size = ggml_element_size(kv_self.k); ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); - ggml_cgraph gf{}; + ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -4147,9 +4147,9 @@ size_t llama_v3_set_state_data(struct llama_v3_context * ctx, uint8_t * src) { kv_ntok, n_embd, n_layer, elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); - ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - llv3_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d)); + ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d)); + llv3_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp index 57ed90888fb53..583bdbe532ce6 100644 --- a/otherarch/mpt_v3.cpp +++ b/otherarch/mpt_v3.cpp @@ -390,7 +390,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, params.no_alloc = false; struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd)); @@ -437,8 +437,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, ggml_view_1d(ctx0, 
model.memory_v, N * n_embd, (ggml_element_size(model.memory_v) * n_embd) * (il * n_ctx + n_past)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, @@ -549,8 +549,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past, // inpL = ggml_soft_max(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - kcpp_graph_compute_helper(&gf, n_threads); + ggml_build_forward_expand(gf, inpL); + kcpp_graph_compute_helper(gf, n_threads); // std::cout << "Qcur" << std::endl; // print_tensor(Qcur); diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp index 15afdbd71caad..28f3a31e5bd74 100644 --- a/otherarch/neox_v3.cpp +++ b/otherarch/neox_v3.cpp @@ -471,7 +471,7 @@ bool gpt_neox_eval( struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; + struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); @@ -534,8 +534,8 @@ bool gpt_neox_eval( ( n_ctx)*ggml_element_size(model.memory_v), (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) @@ -656,8 +656,8 @@ bool gpt_neox_eval( //inpL = ggml_soft_max_inplace(ctx0, inpL); // run the computation - ggml_build_forward_expand(&gf, inpL); - kcpp_graph_compute_helper(&gf, n_threads); + ggml_build_forward_expand(gf, inpL); + kcpp_graph_compute_helper(gf, n_threads); //if (n_past%100 == 0) { // ggml_graph_print (&gf); diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index cbabab5a87cea..8ccc313cf57d4 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -700,7 +700,7 @@ struct rwkv_graph { struct ggml_tensor * tokens; // ggml_cgraph is so large that it can cause stack overflows if not stored on the heap - std::unique_ptr cgraph; + ggml_cgraph * cgraph; size_t pre_logits_nodes; size_t pre_logits_leafs; @@ -1520,13 +1520,13 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptrmodel, serial_graph.tokens, inputs.get(), outputs.get(), logits, - serial_graph.cgraph.get(), + serial_graph.cgraph, &serial_graph.pre_logits_nodes, &serial_graph.pre_logits_leafs, &serial_graph.post_logits_nodes, &serial_graph.post_logits_leafs )); @@ -1638,7 +1638,7 @@ bool rwkv_eval(struct rwkv_context * ctx, const int n_threads, const uint32_t to ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs; } - kcpp_graph_compute_helper(ctx->serial_graph.cgraph.get(),n_threads); + kcpp_graph_compute_helper(ctx->serial_graph.cgraph,n_threads); rwkv_get_outputs(ctx, state_out, logits_out); return true; @@ -1698,13 +1698,13 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui sequence_graph.ctx = graph_future_ctx; RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, sequence_graph.ctx.ctx, "Failed to allocate sequence graph context"); sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, 
sequence_len); - sequence_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph()); + sequence_graph.cgraph = ggml_new_graph(sequence_graph.ctx.ctx); RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph"); RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph( sequence_graph.ctx.ctx, ctx->instance->model, sequence_graph.tokens, ctx->input_layers.get(), ctx->output_layers.get(), ctx->logits, - sequence_graph.cgraph.get(), + sequence_graph.cgraph, &sequence_graph.pre_logits_nodes, &sequence_graph.pre_logits_leafs, &sequence_graph.post_logits_nodes, &sequence_graph.post_logits_leafs )); @@ -1726,7 +1726,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs; } - kcpp_graph_compute_helper(ctx->sequence_graph.cgraph.get(),n_threads); + kcpp_graph_compute_helper(ctx->sequence_graph.cgraph,n_threads); rwkv_get_outputs(ctx, state_out, logits_out); }
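
The ggml-cuda.cu hunks above are warning cleanups rather than behavior changes: each tile helper explicitly discards the `x_qh` parameter that the QK_K == 256 code path never reads, and the casts of the opaque block pointers gain a `const` qualifier. A minimal sketch of both idioms, assuming a stand-in `block_stub` type in place of the real `block_q4_K`/`block_q5_K`/`block_q6_K` blocks:

```cpp
// Sketch of the two warning-silencing idioms applied across the ggml-cuda.cu hunks:
// (1) explicitly discarding a parameter a given QK_K configuration never reads, and
// (2) preserving the const qualifier when casting the opaque block pointer.
struct block_stub { int scales[3]; };  // stand-in for block_q4_K / q5_K / q6_K

static inline float tile_helper_sketch(const void * vx, const int * x_qh) {
    (void) x_qh;  // parameter kept for a uniform tile interface, unused on this path

    const block_stub * bx0    = (const block_stub *) vx;     // was: (block_stub *) vx
    const int *        scales = (const int *) bx0->scales;   // was: (int *) bxi->scales
    return (float) scales[0];
}
```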
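The gpttype_adapter.cpp hunk replaces the blanket "use model internal values" path with a linear RoPE stretch whenever the requested context exceeds the model's trained context and the model is not already using scaling type 2. A minimal sketch of that arithmetic; `auto_rope_scale` and its parameter names are illustrative, but the fields mirror the hparams the patch reads:

```cpp
// Sketch of the automatic linear RoPE scaling added in gpttype_adapter.cpp.
struct rope_params { float freq_base; float freq_scale; };

static rope_params auto_rope_scale(int n_ctx_train, int requested_ctx,
                                   float freq_base_train, float freq_scale_train,
                                   int scaling_type_train) {
    rope_params out { freq_base_train, freq_scale_train };
    // Stretch the scale linearly only when the model was trained on more than 2048
    // tokens, is not already using a dedicated scaling scheme (type 2), and the
    // requested context exceeds what it was trained on.
    if (scaling_type_train != 2 && n_ctx_train > 2048 && requested_ctx > n_ctx_train) {
        const float mult = (float) n_ctx_train / (float) requested_ctx;
        out.freq_scale = mult * freq_scale_train;
    }
    return out;
}
```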
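Two separate hunks add the same logging safeguard: llama.cpp escapes newlines in GGUF key/value metadata before printing it (so a multi-line chat template does not break the KV dump), and gpttype_adapter.cpp does the same for the matched stop sequence. A small sketch of the escape-before-print step, with a local helper standing in for the project's `replace_all`:

```cpp
#include <cstdio>
#include <string>

// Stand-in for the project's replace_all(); shown only to illustrate the
// escape-before-logging step the patch adds in two places.
static void escape_newlines(std::string & s) {
    for (size_t pos = 0; (pos = s.find('\n', pos)) != std::string::npos; pos += 2) {
        s.replace(pos, 1, "\\n");  // one real newline becomes the two characters '\' 'n'
    }
}

int main() {
    std::string stop_seq = "\n### Instruction:";
    escape_newlines(stop_seq);
    printf("(Stop sequence triggered: %s)\n", stop_seq.c_str());  // stays on one log line
    return 0;
}
```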
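The otherarch/*.cpp hunks all make the same mechanical change: `ggml_cgraph` has grown too large to keep on the stack (the rwkv_v3.cpp comment already warned about overflows), so the graph is now allocated out of the ggml context with `ggml_new_graph()` and passed by pointer. A minimal sketch of the before/after pattern; `eval_sketch` and the `logits` tensor are placeholders, and the compute call is shown with ggml's generic helper rather than the project's `kcpp_graph_compute_helper`:

```cpp
#include "ggml.h"

// Placeholder eval body showing only the graph-handling change.
static void eval_sketch(struct ggml_context * ctx0, struct ggml_tensor * logits, int n_threads) {
    // Before: struct ggml_cgraph gf = {};   // stack object, passed around as &gf
    // After:  the graph lives inside the context and is passed as a pointer.
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    ggml_build_forward_expand(gf, logits);             // was: ggml_build_forward_expand(&gf, logits)
    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);  // the patch uses kcpp_graph_compute_helper(gf, n_threads)
}
```

llama_v3.cpp follows the same pattern but sizes the graph up front with `ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false)`, which is why the patch hoists `GGML_MAX_NODES` to file scope.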