refactor: support sd no tiling vae

Signed-off-by: thxCode <thxcode0824@gmail.com>
gpustack · Dec 2, 2024 · 0922277 · 0922277
1 parent e98828f
commit 0922277
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 10 deletions.
diff --git a/README.md b/README.md
@@ -207,7 +207,7 @@ general:
          --version                print version and exit
          --system-info            print system info and exit
          --list-devices           print list of available devices and exit
-  -v,    --verbose, --log-verbose
+  -v,    --verbose, --log-verbose 
                                   set verbosity level to infinity (i.e. log all messages, useful for debugging)
   -lv,   --verbosity, --log-verbosity V
                                   set the verbosity threshold, messages with a higher verbosity will be ignored
@@ -224,7 +224,7 @@ server:
   -m,    --model FILE             model path (default: models/7B/ggml-model-f16.gguf)
   -a,    --alias NAME             model name alias (default: unknown)
          --lora FILE              apply LoRA adapter (implies --no-mmap)
-         --lora-scaled FILE SCALE
+         --lora-scaled FILE SCALE 
                                   apply LoRA adapter with user defined scaling S (implies --no-mmap)
          --lora-init-without-apply
                                   load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)
@@ -242,7 +242,7 @@ server:
 
 server/completion:
 
-  -dev,  --device <dev1,dev2,...>
+  -dev,  --device <dev1,dev2,...> 
                                   comma-separated list of devices to use for offloading (none = don't offload)
                                   use --list-devices to see a list of available devices
   -ngl,  --gpu-layers,  --n-gpu-layers N
@@ -263,25 +263,25 @@ server/completion:
          --slot-save-path PATH    path to save slot kv cache (default: disabled)
   -sps,  --slot-prompt-similarity N
                                   how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
-
+                                  
   -tps   --tokens-per-second N    maximum number of tokens per second (default: 0, 0 = disabled, -1 = try to detect)
                                   when enabled, limit the request within its X-Request-Tokens-Per-Second HTTP header
   -t,    --threads N              number of threads to use during generation (default: -1)
   -C,    --cpu-mask M             set CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "")
   -Cr,   --cpu-range lo-hi        range of CPUs for affinity. Complements --cpu-mask
          --cpu-strict <0|1>       use strict CPU placement (default: 0)
-
+                                  
          --prio N                 set process/thread priority (default: 0), one of:
                                     - 0-normal
                                     - 1-medium
                                     - 2-high
                                     - 3-realtime
          --poll <0...100>         use polling level to wait for work (0 - no polling, default: 50)
-
+                                  
   -tb,   --threads-batch N        number of threads to use during batch and prompt processing (default: same as --threads)
   -Cb,   --cpu-mask-batch M       set CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)
   -Crb,  --cpu-range-batch lo-hi  ranges of CPUs for affinity. Complements --cpu-mask-batch
-         --cpu-strict-batch <0|1>
+         --cpu-strict-batch <0|1> 
                                   use strict CPU placement (default: same as --cpu-strict)
          --prio-batch N           set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)
          --poll-batch <0...100>   use polling to wait for work (default: same as --poll)
@@ -311,7 +311,7 @@ server/completion:
          --dry-base N             set DRY sampling base value (default: 1.75)
          --dry-allowed-length N   set allowed length for DRY sampling (default: 2)
          --dry-penalty-last-n N   set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size)
-         --dry-sequence-breaker N
+         --dry-sequence-breaker N 
                                   add sequence breaker for DRY sampling, clearing out default breakers (
                                   ;:;";*) in the process; use "none" to not use any sequence breakers
          --dynatemp-range N       dynamic temperature range (default: 0.0, 0.0 = disabled)
@@ -400,13 +400,14 @@ server/images:
                                   path to the CLIP Large (clip-l) text encoder, or use --model included
          --image-clip-g-model PATH
                                   path to the CLIP Generic (clip-g) text encoder, or use --model included
-         --image-t5xxl-model PATH
+         --image-t5xxl-model PATH 
                                   path to the Text-to-Text Transfer Transformer (t5xxl) text encoder, or use --model included
          --image-no-vae-model-offload
                                   disable vae(taesd) model offload
          --image-vae-model PATH   path to Variational AutoEncoder (vae), or use --model included
          --image-vae-tiling       indicate to process vae decoder in tiles to reduce memory usage (default: disabled)
-         --image-taesd-model PATH
+         --image-no-vae-tiling    disable vae decoder in tiles
+         --image-taesd-model PATH 
                                   path to Tiny AutoEncoder For StableDiffusion (taesd), or use --model included
          --image-upscale-model PATH
                                   path to the upscale model, or use --model included

diff --git a/llama-box/param.hpp b/llama-box/param.hpp
@@ -320,6 +320,7 @@ static void llama_box_params_print_usage(int, char **argv, const llama_box_param
     }
     opts.push_back({ "server/images",                      "       --image-vae-model PATH",                 "path to Variational AutoEncoder (vae), or use --model included" });
     opts.push_back({ "server/images",                      "       --image-vae-tiling",                     "indicate to process vae decoder in tiles to reduce memory usage (default: %s)", sdparams.vae_tiling ? "enabled" : "disabled" });
+    opts.push_back({ "server/images",                      "       --image-no-vae-tiling",                  "disable vae decoder in tiles" });
     opts.push_back({ "server/images",                      "       --image-taesd-model PATH",               "path to Tiny AutoEncoder For StableDiffusion (taesd), or use --model included" });
     opts.push_back({ "server/images",                      "       --image-upscale-model PATH",             "path to the upscale model, or use --model included" });
     opts.push_back({ "server/images",                      "       --image-upscale-repeats N",              "how many times to run upscaler (default: %d)", sdparams.upscale_repeats });
@@ -1872,6 +1873,11 @@ static bool llama_box_params_parse(int argc, char **argv, llama_box_params &bpar
                 continue;
             }
 
+            if (!strcmp(flag, "--image-no-vae-tiling")) {
+                bparams.sdparams.vae_tiling = false;
+                continue;
+            }
+
             if (!strcmp(flag, "--image-taesd-model")) {
                 if (i == argc) {
                     missing("--image-taesd-model");