
Commit

Merge remote-tracking branch 'upstream/concedo'
YellowRoseCx committed Nov 18, 2023
2 parents 11aa596 + 22c56f9 commit 8080c01
Showing 66 changed files with 5,615 additions and 3,382 deletions.
17 changes: 15 additions & 2 deletions CMakeLists.txt
@@ -254,6 +254,15 @@ if (LLAMA_LTO)
    endif()
endif()

# this version of Apple ld64 is buggy
execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
    ERROR_VARIABLE output
)
if (output MATCHES "dyld-1015\.7")
    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
endif()
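
The new HAVE_BUGGY_APPLE_LINKER symbol is an ordinary compile definition, so source files can branch on it at preprocessing time. A minimal, hypothetical sketch of a consumer (the real workaround guarded by this define is not part of this hunk):

// Illustrative only: pick a fallback path when the buggy Apple ld64 1015.7 was detected.
#ifdef HAVE_BUGGY_APPLE_LINKER
#    define USE_LD64_WORKAROUND 1   // hypothetical flag, not from this commit
#else
#    define USE_LD64_WORKAROUND 0
#endif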

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
@@ -342,8 +351,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
    add_compile_options(-mcpu=native -mtune=native)
    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
        add_compile_options(-mcpu=powerpc64le)
    else()
        add_compile_options(-mcpu=native -mtune=native)
        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
else()
    message(STATUS "Unknown architecture")
endif()
7 changes: 7 additions & 0 deletions common/common.cpp
@@ -1073,6 +1073,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
    return result;
}

bool llama_should_add_bos_token(const llama_model * model) {
    const int add_bos = llama_add_bos_token(model);

    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}

//
// YAML utils
//
@@ -1189,6 +1195,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
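
The regex_replace line added above doubles any backslash that is not already part of an inserted \n or \" escape, so literal backslashes in the value survive the YAML quoting. A small standalone sketch of the same escaping sequence (the sample input is illustrative, not from the commit):

#include <cstdio>
#include <regex>
#include <string>

int main() {
    // sample value containing a backslash, a double quote, and a newline
    std::string data_str = "path C:\\tmp \"quoted\"\nsecond line";
    data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");            // newline      -> \n
    data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");           // double quote -> \"
    data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)"); // remaining backslash -> \\
    data_str = "\"" + data_str + "\"";
    printf("%s\n", data_str.c_str());  // prints: "path C:\\tmp \"quoted\"\nsecond line"
}
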
4 changes: 4 additions & 0 deletions common/common.h
@@ -208,6 +208,10 @@ std::string llama_detokenize_bpe(
llama_context * ctx,
const std::vector<llama_token> & tokens);

// Uses the value from the model metadata if possible, otherwise
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);

//
// YAML utils
//
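
For context, a minimal sketch of a call site for the helper declared above, assuming the usual llama.cpp entry points (llama_get_model and the llama_tokenize wrapper from common.h); this is not code from the commit:

#include "common.h"
#include "llama.h"

// Let the model's metadata decide whether a BOS token is prepended, then tokenize.
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    const llama_model * model  = llama_get_model(ctx);
    const bool         add_bos = llama_should_add_bos_token(model);
    return llama_tokenize(ctx, prompt, add_bos);
}
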
13 changes: 13 additions & 0 deletions common/train.cpp
@@ -32,6 +32,7 @@ struct train_state * init_train_state() {
    state->opt = new struct ggml_opt_context;
    state->opt->ctx = NULL;
    state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
    state->opt->loss_after = 0.0f;

    return state;
@@ -1135,6 +1136,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
    fprintf(stderr, " --adam-beta2 N AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
    fprintf(stderr, " --adam-gclip N AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
    fprintf(stderr, " --adam-epsf N AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
    fprintf(stderr, " -ngl N, --n-gpu-layers N Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
    fprintf(stderr, "\n");
}

@@ -1354,6 +1356,17 @@ bool consume_common_train_arg(
            return true;
        }
        params->adam_gclip = std::stof(argv[i]);
    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
        if (++i >= argc) {
            *invalid_param = true;
            return true;
        }
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        params->n_gpu_layers = std::stoi(argv[i]);
#else
        fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
        fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
    } else if (arg == "-h" || arg == "--help") {
        params->print_usage = true;
        return true;
2 changes: 2 additions & 0 deletions common/train.h
@@ -9,6 +9,8 @@
#include "ggml.h"
#include "llama.h"

#define LLAMA_TRAIN_MAX_NODES 16384

typedef std::string mt19937_state;

struct train_state {
4 changes: 1 addition & 3 deletions convert-baichuan-hf-to-gguf.py
@@ -6,17 +6,15 @@
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf


32 changes: 20 additions & 12 deletions convert-hf-to-gguf.py
@@ -150,8 +150,6 @@ def load_hparams(dir_model):

    @staticmethod
    def from_model_architecture(model_architecture):
        if model_architecture == "StableLMEpochForCausalLM":
            return StableLMModel
        if model_architecture == "GPTNeoXForCausalLM":
            return GPTNeoXModel
        if model_architecture == "BloomForCausalLM":
@@ -168,6 +166,8 @@ def from_model_architecture(model_architecture):
            return RefactModel
        if model_architecture == "PersimmonForCausalLM":
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
        return Model

    def _is_model_safetensors(self) -> bool:
@@ -193,14 +193,16 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
            return gguf.MODEL_ARCH.MPT
        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
            return gguf.MODEL_ARCH.BAICHUAN
        if arch == "FalconForCausalLM":
        if arch in ("FalconForCausalLM", "RWForCausalLM"):
            return gguf.MODEL_ARCH.FALCON
        if arch == "GPTBigCodeForCausalLM":
            return gguf.MODEL_ARCH.STARCODER
        if arch == "GPTRefactForCausalLM":
            return gguf.MODEL_ARCH.REFACT
        if arch == "PersimmonForCausalLM":
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -294,15 +296,6 @@ def _set_vocab_sentencepiece(self):
        special_vocab.add_to_gguf(self.gguf_writer)


class StableLMModel(Model):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_dimension_count(
            int(self.hparams["rope_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
        )
        self.gguf_writer.add_layer_norm_eps(1e-5)


class GPTNeoXModel(Model):
    def set_gguf_parameters(self):
        block_count = self.hparams["num_hidden_layers"]
@@ -824,6 +817,21 @@ def write_tensors(self):
            self.gguf_writer.add_tensor(new_name, data)


class StableLMModel(Model):
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

        self.gguf_writer.add_name(dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)

###### CONVERSION LOGIC ######

def parse_args() -> argparse.Namespace:
25 changes: 2 additions & 23 deletions convert-llama-ggml-to-gguf.py
@@ -2,7 +2,6 @@
from __future__ import annotations

import argparse
import math
import struct
import sys
from enum import IntEnum
@@ -12,29 +11,9 @@

import os
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
    gguf.GGMLQuantizationType.F32 : (1, 4),
    gguf.GGMLQuantizationType.F16 : (1, 2),
    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
}

class GGMLFormat(IntEnum):
GGML = 0
GGMF = 1
@@ -125,7 +104,7 @@ def load(self, data, offset):
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
        quant = GGML_QUANT_SIZES.get(dtype)
        quant = gguf.GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
2 changes: 1 addition & 1 deletion convert-persimmon-to-gguf.py
@@ -6,7 +6,7 @@
from pathlib import Path
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

def _flatten_dict(dct, tensors, prefix=None):
22 changes: 13 additions & 9 deletions convert.py
@@ -3,11 +3,9 @@

import argparse
import concurrent.futures
import copy
import enum
import faulthandler
import functools
import io
import itertools
import json
import math
@@ -23,14 +21,14 @@
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar

import numpy as np
from sentencepiece import SentencePieceProcessor

import os
if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf

if TYPE_CHECKING:
@@ -692,6 +690,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
                              data_base_path=pickle_paths[0][:-4],
                              zip_file=zf)
    model = unpickler.load()
    if 'model' in model: model = model['model']
    as_dict = dict(model.items())
    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)

@@ -851,7 +850,7 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
        elif isinstance(vocab, BpeVocab):
            self.gguf.add_tokenizer_model("gpt2")
        else:
            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
            raise ValueError('Unknown vocab type: Not BpeVocab or SentencePieceVocab')
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)
@@ -905,7 +904,7 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
        return dt.quantize(arr)

    @staticmethod
    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
        check_vocab_size(params, vocab)

        of = OutputFile(fname_out, endianess=endianess)
@@ -1038,7 +1037,8 @@ def load_some_model(path: Path) -> ModelPlus:
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
        # Check if it's a set of safetensors files first
        files = list(path.glob("model-00001-of-*.safetensors"))
        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
        files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try the PyTorch patterns too, with lower priority
            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
@@ -1114,14 +1114,18 @@ def do_dump_model(model_plus: ModelPlus) -> None:


def main(args_in: list[str] | None = None) -> None:
    output_choices = ["f32", "f16"]
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
    parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outtype", choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -24,6 +24,7 @@ else()
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
    add_subdirectory(main)
    add_subdirectory(tokenize)
    add_subdirectory(parallel)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
