Skip to content

Commit

Permalink
refactor: clarify params
Browse files Browse the repository at this point in the history
Signed-off-by: thxCode <thxcode0824@gmail.com>
  • Loading branch information
thxCode committed Nov 26, 2024
1 parent 46f134a commit 0bafa78
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 16 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ server/completion:
--frequency-penalty N repeat alpha frequency penalty (default: 0.0, 0.0 = disabled)
--dry-multiplier N set DRY sampling multiplier (default: 0.0, 0.0 = disabled)
--dry-base N set DRY sampling base value (default: 1.75)
--dry--allowed-length N set allowed length for DRY sampling (default: 2)
--dry-allowed-length N set allowed length for DRY sampling (default: 2)
--dry-penalty-last-n N set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size)
--dry-sequence-breaker N
add sequence breaker for DRY sampling, clearing out default breakers (
Expand All @@ -332,13 +332,12 @@ server/completion:
--yarn-beta-fast N YaRN: low correction dim or beta (default: 32.0)
--yarn-beta-slow N YaRN: high correction dim or alpha (default: 1.0)
-nkvo, --no-kv-offload disable KV offload
--cache-prompt enable caching prompt (default: enabled)
--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting, implicit --cache-prompt if value (default: 0)
--no-cache-prompt disable caching prompt
--cache-reuse N min chunk size to attempt reusing from the cache via KV shifting (default: 0)
-ctk, --cache-type-k TYPE KV cache data type for K (default: f16)
-ctv, --cache-type-v TYPE KV cache data type for V (default: f16)
-dt, --defrag-thold N KV cache defragmentation threshold (default: 0.1, < 0 - disabled)
-np, --parallel N number of parallel sequences to decode (default: 1)
-cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)
-nocb, --no-cont-batching disable continuous batching
--mmproj FILE path to a multimodal projector file for LLaVA
--mlock force system to keep model in RAM rather than swapping or compressing
Expand Down Expand Up @@ -436,6 +435,8 @@ Available environment variables (if the corresponding command-line option is not
- `LLAMA_ARG_DEVICE`: equivalent to `-dev`, `--device`.
- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`, `--gpu-layers`, `--n-gpu-layers`.
- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`.
- `LLAMA_ARG_CACHE_PROMPT`: if set to `0`, it will **disable** prompt caching (equivalent
to `--no-cache-prompt`). Prompt caching is enabled by default.
- `LLAMA_ARG_CACHE_REUSE`: equivalent to `--cache-reuse`.
- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`.
- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`, `--predict`.
Expand Down
17 changes: 6 additions & 11 deletions llama-box/param.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ static void llama_box_params_print_usage(int, char **argv, const llama_box_param
opts.push_back({ "server/completion", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sampling.penalty_freq });
opts.push_back({ "server/completion", " --dry-multiplier N", "set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier });
opts.push_back({ "server/completion", " --dry-base N", "set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base });
opts.push_back({ "server/completion", " --dry--allowed-length N", "set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length });
opts.push_back({ "server/completion", " --dry-allowed-length N", "set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length });
opts.push_back({ "server/completion", " --dry-penalty-last-n N", "set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n });
opts.push_back({ "server/completion", " --dry-sequence-breaker N", "add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers", default_dry_sequence_breaker_names.c_str() });
opts.push_back({ "server/completion", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sampling.dynatemp_range });
Expand All @@ -248,13 +248,12 @@ static void llama_box_params_print_usage(int, char **argv, const llama_box_param
if (llama_supports_gpu_offload()) {
opts.push_back({ "server/completion", "-nkvo, --no-kv-offload", "disable KV offload" });
}
opts.push_back({ "server/completion", " --cache-prompt", "enable caching prompt (default: %s)", bparams.cache_prompt ? "enabled" : "disabled" });
opts.push_back({ "server/completion", " --cache-reuse N", "min chunk size to attempt reusing from the cache via KV shifting, implicit --cache-prompt if value (default: %d)", params.n_cache_reuse });
opts.push_back({ "server/completion", " --no-cache-prompt", "disable caching prompt" });
opts.push_back({ "server/completion", " --cache-reuse N", "min chunk size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse });
opts.push_back({ "server/completion", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
opts.push_back({ "server/completion", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
opts.push_back({ "server/completion", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
opts.push_back({ "server/completion", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
opts.push_back({ "server/completion", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
opts.push_back({ "server/completion", "-nocb, --no-cont-batching", "disable continuous batching" });
opts.push_back({ "server/completion", " --mmproj FILE", "path to a multimodal projector file for LLaVA" });
if (llama_supports_mlock()) {
Expand Down Expand Up @@ -1383,8 +1382,8 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &bpar
continue;
}

if (!strcmp(flag, "--cache-prompt")) {
bparams.cache_prompt = true;
if (!strcmp(flag, "--no-cache-prompt")) {
bparams.cache_prompt = false;
continue;
}

Expand Down Expand Up @@ -1436,11 +1435,6 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &bpar
continue;
}

if (!strcmp(flag, "-cb") || !strcmp(flag, "--cont-batching")) {
bparams.gparams.cont_batching = true;
continue;
}

if (!strcmp(flag, "-nocb") || !strcmp(flag, "--no-cont-batching")) {
bparams.gparams.cont_batching = false;
continue;
Expand Down Expand Up @@ -1949,6 +1943,7 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &bpar
get_env("LLAMA_ARG_DEVICE", bparams.gparams.devices);
get_env("LLAMA_ARG_N_GPU_LAYERS", bparams.gparams.n_gpu_layers);
get_env("LLAMA_ARG_THREADS_HTTP", bparams.gparams.n_threads_http);
get_env("LLAMA_ARG_CACHE_PROMPT", bparams.cache_prompt);
get_env("LLAMA_ARG_CACHE_REUSE", bparams.gparams.n_cache_reuse);
get_env("LLAMA_ARG_CHAT_TEMPLATE", bparams.gparams.chat_template);
get_env("LLAMA_ARG_N_PREDICT", bparams.gparams.n_predict);
Expand Down
2 changes: 1 addition & 1 deletion llama-box/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1068,7 +1068,7 @@ struct server_context {
server_slot *ret = nullptr;

// find the slot that has at least n% prompt similarity
if (ret == nullptr && slot_prompt_similarity != 0.0f) {
if (slot_prompt_similarity != 0.0f) {
int lcs_len = 0;
float similarity = 0;

Expand Down

0 comments on commit 0bafa78

Please sign in to comment.