
Commit

Write a lot of documentation and man pages
jart committed Aug 18, 2024
1 parent 7450034 commit 202e554
Showing 27 changed files with 1,173 additions and 505 deletions.
4 changes: 4 additions & 0 deletions Makefile
@@ -31,11 +31,13 @@ o/$(MODE)/: o/$(MODE)/llamafile \
# for installing to `make PREFIX=/usr/local`
.PHONY: install
install: llamafile/zipalign.1 \
llamafile/server/main.1 \
llama.cpp/main/main.1 \
llama.cpp/imatrix/imatrix.1 \
llama.cpp/quantize/quantize.1 \
llama.cpp/perplexity/perplexity.1 \
llama.cpp/llava/llava-quantize.1 \
whisper.cpp/main.1 \
o/$(MODE)/llamafile/zipalign \
o/$(MODE)/llamafile/tokenize \
o/$(MODE)/llama.cpp/main/main \
@@ -62,11 +64,13 @@ install: llamafile/zipalign.1 \
$(INSTALL) o/$(MODE)/whisper.cpp/main $(PREFIX)/bin/whisperfile
mkdir -p $(PREFIX)/share/man/man1
$(INSTALL) -m 0644 llamafile/zipalign.1 $(PREFIX)/share/man/man1/zipalign.1
$(INSTALL) -m 0644 llamafile/server/main.1 $(PREFIX)/share/man/man1/llamafiler.1
$(INSTALL) -m 0644 llama.cpp/main/main.1 $(PREFIX)/share/man/man1/llamafile.1
$(INSTALL) -m 0644 llama.cpp/imatrix/imatrix.1 $(PREFIX)/share/man/man1/llamafile-imatrix.1
$(INSTALL) -m 0644 llama.cpp/quantize/quantize.1 $(PREFIX)/share/man/man1/llamafile-quantize.1
$(INSTALL) -m 0644 llama.cpp/perplexity/perplexity.1 $(PREFIX)/share/man/man1/llamafile-perplexity.1
$(INSTALL) -m 0644 llama.cpp/llava/llava-quantize.1 $(PREFIX)/share/man/man1/llava-quantize.1
$(INSTALL) -m 0644 whisper.cpp/main.1 $(PREFIX)/share/man/man1/whisperfile.1

.PHONY: check
check: o/$(MODE)/llamafile/check
85 changes: 59 additions & 26 deletions llama.cpp/common.cpp
@@ -640,14 +640,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--lora") {
CHECK_ARG
params.lora_adapter.emplace_back(argv[i], 1.0f);
params.lora_adapters.push_back({
std::string(argv[i]),
1.0,
});
return true;
}
if (arg == "--lora-scaled") {
CHECK_ARG
const char* lora_adapter = argv[i];
CHECK_ARG
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
params.lora_adapters.push_back({
lora_adapter,
std::stof(argv[i]),
});
return true;
}
if (arg == "--control-vector") {
@@ -1725,6 +1731,17 @@ std::string string_get_sortable_timestamp() {
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
}

void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return; // Avoid infinite loop if 'search' is an empty string
}
size_t pos = 0;
while ((pos = s.find(search, pos)) != std::string::npos) {
s.replace(pos, search.length(), replace);
pos += replace.length();
}
}

void string_process_escapes(std::string & input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
Expand Down Expand Up @@ -1998,8 +2015,8 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);

llama_model * model = nullptr;
@@ -2014,7 +2031,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
return iparams;
}

auto cparams = llama_context_params_from_gpt_params(params);
@@ -2023,7 +2040,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
return iparams;
}

if (!params.control_vectors.empty()) {
Expand All @@ -2034,7 +2051,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (cvec.n_embd == -1) {
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
return iparams;
}

int err = llama_control_vector_apply(lctx,
@@ -2046,21 +2063,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
if (err) {
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
return iparams;
}
}

for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
float lora_scale = std::get<1>(params.lora_adapter[i]);
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
if (adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
return iparams;
}
llama_lora_adapter_set(lctx, adapter, lora_scale);
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
llama_lora_adapters_apply(lctx, iparams.lora_adapters);
}

if (params.ignore_eos) {
@@ -2088,13 +2110,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
tmp.clear();
tmp.push_back(decoder_start_token_id);
}
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
if (llama_model_has_decoder(model)) {
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
}
llama_kv_cache_clear(lctx);
llama_synchronize(lctx);
llama_reset_timings(lctx);
}

return std::make_tuple(model, lctx);
iparams.model = model;
iparams.context = lctx;
return iparams;
}

void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.adapter, la.scale);
}
}
}

struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3126,18 +3161,16 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
}

fprintf(stream, "lora:\n");
for (std::tuple<std::string, float> la : params.lora_adapter) {
if (std::get<1>(la) != 1.0f) {
continue;
for (auto & la : params.lora_adapters) {
if (la.scale == 1.0f) {
fprintf(stream, " - %s\n", la.path.c_str());
}
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
}
fprintf(stream, "lora_scaled:\n");
for (std::tuple<std::string, float> la : params.lora_adapter) {
if (std::get<1>(la) == 1.0f) {
continue;
for (auto & la : params.lora_adapters) {
if (la.scale != 1.0f) {
fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
}
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
}
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
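A usage note (not part of the diff): because the refactor above collects --lora and --lora-scaled arguments into params.lora_adapters and returns the loaded adapters in llama_init_result, a caller can defer applying them and re-weight them later. A minimal C++ sketch, assuming the new common.h below is included; the model path, adapter path, and 0.5f scale are illustrative:

    gpt_params params;
    params.model = "model.gguf";                             // illustrative path
    params.lora_adapters.push_back({"adapter.gguf", 1.0f});  // same shape the --lora parsing above produces
    params.lora_init_without_apply = true;                   // load the adapter now, apply it later

    llama_init_result init = llama_init_from_gpt_params(params);
    if (init.model == nullptr || init.context == nullptr)
        return 1;                                            // init already printed an error

    // re-weight the adapter and (re)apply the list: llama_lora_adapters_apply()
    // clears the context, then sets every adapter whose scale is non-zero
    if (!init.lora_adapters.empty())
        init.lora_adapters[0].scale = 0.5f;
    llama_lora_adapters_apply(init.context, init.lora_adapters);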
27 changes: 23 additions & 4 deletions llama.cpp/common.h
@@ -38,6 +38,15 @@

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct llama_lora_adapter_info {
std::string path;
float scale;
};

struct llama_lora_adapter_container : llama_lora_adapter_info {
struct llama_lora_adapter * adapter;
};

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
@@ -131,8 +140,8 @@ struct gpt_params {
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;

// TODO: avoid tuple, use struct
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -282,6 +291,8 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();

void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
@@ -313,15 +324,23 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//

// TODO: avoid tuplue, use struct
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
struct llama_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
std::vector<llama_lora_adapter_container> lora_adapters;
};

struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);

// Batch utils

void llama_batch_clear(struct llama_batch & batch);
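The newly exported string_replace_all() declared in this header (and defined in common.cpp above) is a plain global substring substitution. A short sketch with illustrative strings:

    std::string prompt = "Hello, {name}! Goodbye, {name}!";
    string_replace_all(prompt, "{name}", "world");  // replaces every occurrence
    // prompt == "Hello, world! Goodbye, world!"
    string_replace_all(prompt, "", "x");            // empty search is a no-op, so it cannot loop forever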
10 changes: 5 additions & 5 deletions llama.cpp/imatrix/imatrix.cpp
@@ -437,8 +437,8 @@ static void process_logits(
}

static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx);

auto tim1 = std::chrono::high_resolution_clock::now();
@@ -629,10 +629,10 @@ int main(int argc, char ** argv) {
params.warmup = false;

// init
llama_model * model;
llama_context * ctx;
llama_init_result llama_init = llama_init_from_gpt_params(params);

std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
return 1;
8 changes: 4 additions & 4 deletions llama.cpp/main/embedding.cpp
@@ -99,11 +99,11 @@ int embedding_cli(int argc, char ** argv) {
llama_backend_init();
llama_numa_init(params.numa);

llama_model * model;
llama_context * ctx;

// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
llama_init_result llama_init = llama_init_from_gpt_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
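As the imatrix and embedding changes show, callers that used to unpack a std::tuple with std::tie now read fields from llama_init_result. A condensed sketch of the new pattern; the cleanup calls at the end are the usual ones and are not changed by this commit:

    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }
    // ... run the usual inference or embedding loop ...
    llama_free(ctx);
    llama_free_model(model);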
[diffs for the remaining 22 changed files not shown]
