From 0ae11ea6df2ca038d2aaa319a290f928c7c1619a Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Fri, 9 Aug 2024 15:10:08 -0700 Subject: [PATCH 1/8] v0 worker-config --- worker-config.json | 1032 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1032 insertions(+) create mode 100644 worker-config.json diff --git a/worker-config.json b/worker-config.json new file mode 100644 index 0000000..4f96a78 --- /dev/null +++ b/worker-config.json @@ -0,0 +1,1032 @@ +{ + "0.5.3": { + "categories": [ + { + "title": "LLM Settings", + "settings": [ + { + "TOKENIZER": { + "value": "", + "title": "Tokenizer", + "description": "Name or path of the Hugging Face tokenizer to use.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "TOKENIZER_MODE": { + "value": "auto", + "title": "Tokenizer Mode", + "description": "The tokenizer mode.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "slow", "label": "slow" } + ] + }, + "SKIP_TOKENIZER_INIT": { + "value": false, + "title": "Skip Tokenizer Init", + "description": "Skip initialization of tokenizer and detokenizer.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "TRUST_REMOTE_CODE": { + "value": false, + "title": "Trust Remote Code", + "description": "Trust remote code from Hugging Face.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "DOWNLOAD_DIR": { + "value": "", + "title": "Download Directory", + "description": "Directory to download and load the weights.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "LOAD_FORMAT": { + "value": "auto", + "title": "Load Format", + "description": "The format of the model weights to load.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "pt", "label": "pt" }, + { "value": "safetensors", "label": "safetensors" }, + { "value": "npcache", "label": "npcache" }, + { "value": "dummy", "label": "dummy" }, + { "value": "tensorizer", "label": "tensorizer" }, + { "value": "bitsandbytes", "label": "bitsandbytes" } + ] + }, + "DTYPE": { + "value": "auto", + "title": "Data Type", + "description": "Data type for model weights and activations.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "half", "label": "half" }, + { "value": "float16", "label": "float16" }, + { "value": "bfloat16", "label": "bfloat16" }, + { "value": "float", "label": "float" }, + { "value": "float32", "label": "float32" } + ] + }, + "KV_CACHE_DTYPE": { + "value": "auto", + "title": "KV Cache Data Type", + "description": "Data type for KV cache storage.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "fp8", "label": "fp8" } + ] + }, + "QUANTIZATION_PARAM_PATH": { + "value": "", + "title": "Quantization Param Path", + "description": "Path to the JSON file containing the KV cache scaling factors.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "MAX_MODEL_LEN": { + "value": "", + "title": "Max Model Length", + "description": "Model context length.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "GUIDED_DECODING_BACKEND": { + "value": "outlines", + "title": "Guided Decoding Backend", + "description": "Which engine will be used 
for guided decoding by default.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "outlines", "label": "outlines" }, + { "value": "lm-format-enforcer", "label": "lm-format-enforcer" } + ] + }, + "DISTRIBUTED_EXECUTOR_BACKEND": { + "value": "", + "title": "Distributed Executor Backend", + "description": "Backend to use for distributed serving.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "ray", "label": "ray" }, + { "value": "mp", "label": "mp" } + ] + }, + "WORKER_USE_RAY": { + "value": false, + "title": "Worker Use Ray", + "description": "Deprecated, use --distributed-executor-backend=ray.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "RAY_WORKERS_USE_NSIGHT": { + "value": false, + "title": "Ray Workers Use Nsight", + "description": "If specified, use nsight to profile Ray workers.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "PIPELINE_PARALLEL_SIZE": { + "value": 1, + "title": "Pipeline Parallel Size", + "description": "Number of pipeline stages.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "TENSOR_PARALLEL_SIZE": { + "value": 1, + "title": "Tensor Parallel Size", + "description": "Number of tensor parallel replicas.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MAX_PARALLEL_LOADING_WORKERS": { + "value": "", + "title": "Max Parallel Loading Workers", + "description": "Load model sequentially in multiple batches.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "ENABLE_PREFIX_CACHING": { + "value": false, + "title": "Enable Prefix Caching", + "description": "Enables automatic prefix caching.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "DISABLE_SLIDING_WINDOW": { + "value": false, + "title": "Disable Sliding Window", + "description": "Disables sliding window, capping to sliding window size.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "USE_V2_BLOCK_MANAGER": { + "value": false, + "title": "Use V2 Block Manager", + "description": "Use BlockSpaceMangerV2.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "NUM_LOOKAHEAD_SLOTS": { + "value": 0, + "title": "Num Lookahead Slots", + "description": "Experimental scheduling config necessary for speculative decoding.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "SEED": { + "value": 0, + "title": "Seed", + "description": "Random seed for operations.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "NUM_GPU_BLOCKS_OVERRIDE": { + "value": "", + "title": "Num GPU Blocks Override", + "description": "If specified, ignore GPU profiling result and use this number of GPU blocks.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MAX_NUM_BATCHED_TOKENS": { + "value": "", + "title": "Max Num Batched Tokens", + "description": "Maximum number of batched tokens per iteration.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MAX_NUM_SEQS": { + "value": 256, + "title": "Max Num Seqs", + "description": "Maximum number of sequences per iteration.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MAX_LOGPROBS": { + "value": 20, + "title": "Max Logprobs", + "description": "Max number of log probs to return when logprobs is specified in SamplingParams.", 
+ "required": false, + "type": "number", + "category": "LLM Settings" + }, + "DISABLE_LOG_STATS": { + "value": false, + "title": "Disable Log Stats", + "description": "Disable logging statistics.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "QUANTIZATION": { + "value": "", + "title": "Quantization", + "description": "Method used to quantize the weights.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "None", "label": "None" }, + { "value": "awq", "label": "AWQ" }, + { "value": "squeezellm", "label": "SqueezeLLM" }, + { "value": "gptq", "label": "GPTQ" } + ] + }, + "ROPE_SCALING": { + "value": "", + "title": "RoPE Scaling", + "description": "RoPE scaling configuration in JSON format.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "ROPE_THETA": { + "value": "", + "title": "RoPE Theta", + "description": "RoPE theta. Use with rope_scaling.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "TOKENIZER_POOL_SIZE": { + "value": 0, + "title": "Tokenizer Pool Size", + "description": "Size of tokenizer pool to use for asynchronous tokenization.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "TOKENIZER_POOL_TYPE": { + "value": "ray", + "title": "Tokenizer Pool Type", + "description": "Type of tokenizer pool to use for asynchronous tokenization.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "TOKENIZER_POOL_EXTRA_CONFIG": { + "value": "", + "title": "Tokenizer Pool Extra Config", + "description": "Extra config for tokenizer pool.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "ENABLE_LORA": { + "value": false, + "title": "Enable LoRA", + "description": "If True, enable handling of LoRA adapters.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "MAX_LORAS": { + "value": 1, + "title": "Max LoRAs", + "description": "Max number of LoRAs in a single batch.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MAX_LORA_RANK": { + "value": 16, + "title": "Max LoRA Rank", + "description": "Max LoRA rank.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "LORA_EXTRA_VOCAB_SIZE": { + "value": 256, + "title": "LoRA Extra Vocab Size", + "description": "Maximum size of extra vocabulary for LoRA adapters.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "LORA_DTYPE": { + "value": "auto", + "title": "LoRA Data Type", + "description": "Data type for LoRA.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "float16", "label": "float16" }, + { "value": "bfloat16", "label": "bfloat16" }, + { "value": "float32", "label": "float32" } + ] + }, + "LONG_LORA_SCALING_FACTORS": { + "value": "", + "title": "Long LoRA Scaling Factors", + "description": "Specify multiple scaling factors for LoRA adapters.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "MAX_CPU_LORAS": { + "value": "", + "title": "Max CPU LoRAs", + "description": "Maximum number of LoRAs to store in CPU memory.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "FULLY_SHARDED_LORAS": { + "value": false, + "title": "Fully Sharded LoRAs", + "description": "Enable fully sharded LoRA layers.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + 
"DEVICE": { + "value": "auto", + "title": "Device", + "description": "Device type for vLLM execution.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "cuda", "label": "cuda" }, + { "value": "neuron", "label": "neuron" }, + { "value": "cpu", "label": "cpu" }, + { "value": "openvino", "label": "openvino" }, + { "value": "tpu", "label": "tpu" }, + { "value": "xpu", "label": "xpu" } + ] + }, + "SCHEDULER_DELAY_FACTOR": { + "value": 0.0, + "title": "Scheduler Delay Factor", + "description": "Apply a delay before scheduling next prompt.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "ENABLE_CHUNKED_PREFILL": { + "value": false, + "title": "Enable Chunked Prefill", + "description": "Enable chunked prefill requests.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "SPECULATIVE_MODEL": { + "value": "", + "title": "Speculative Model", + "description": "The name of the draft model to be used in speculative decoding.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "NUM_SPECULATIVE_TOKENS": { + "value": "", + "title": "Num Speculative Tokens", + "description": "The number of speculative tokens to sample from the draft model.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": { + "value": "", + "title": "Speculative Draft Tensor Parallel Size", + "description": "Number of tensor parallel replicas for the draft model.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "SPECULATIVE_MAX_MODEL_LEN": { + "value": "", + "title": "Speculative Max Model Length", + "description": "The maximum sequence length supported by the draft model.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "SPECULATIVE_DISABLE_BY_BATCH_SIZE": { + "value": "", + "title": "Speculative Disable by Batch Size", + "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "NGRAM_PROMPT_LOOKUP_MAX": { + "value": "", + "title": "Ngram Prompt Lookup Max", + "description": "Max size of window for ngram prompt lookup in speculative decoding.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "NGRAM_PROMPT_LOOKUP_MIN": { + "value": "", + "title": "Ngram Prompt Lookup Min", + "description": "Min size of window for ngram prompt lookup in speculative decoding.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "SPEC_DECODING_ACCEPTANCE_METHOD": { + "value": "rejection_sampler", + "title": "Speculative Decoding Acceptance Method", + "description": "Specify the acceptance method for draft token verification in speculative decoding.", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "rejection_sampler", "label": "rejection_sampler" }, + { "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" } + ] + }, + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": { + "value": "", + "title": "Typical Acceptance Sampler Posterior Threshold", + "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": { + "value": "", + "title": "Typical 
Acceptance Sampler Posterior Alpha", + "description": "A scaling factor for the entropy-based threshold for token acceptance.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MODEL_LOADER_EXTRA_CONFIG": { + "value": "", + "title": "Model Loader Extra Config", + "description": "Extra config for model loader.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "PREEMPTION_MODE": { + "value": "", + "title": "Preemption Mode", + "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "PREEMPTION_CHECK_PERIOD": { + "value": 1.0, + "title": "Preemption Check Period", + "description": "How frequently the engine checks if a preemption happens.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "PREEMPTION_CPU_CAPACITY": { + "value": 2, + "title": "Preemption CPU Capacity", + "description": "The percentage of CPU memory used for the saved activations.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "MAX_LOG_LEN": { + "value": "", + "title": "Max Log Length", + "description": "Max number of characters or ID numbers being printed in log.", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "DISABLE_LOGGING_REQUEST": { + "value": false, + "title": "Disable Logging Request", + "description": "Disable logging requests.", + "required": false, + "type": "toggle", + "category": "LLM Settings" + } + } + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + { + "TOKENIZER_NAME": { + "value": "", + "title": "Tokenizer Name", + "description": "Tokenizer repo to use a different tokenizer than the model's default", + "required": false, + "type": "text", + "category": "Tokenizer Settings" + }, + "TOKENIZER_REVISION": { + "value": "", + "title": "Tokenizer Revision", + "description": "Tokenizer revision to load", + "required": false, + "type": "text", + "category": "Tokenizer Settings" + }, + "CUSTOM_CHAT_TEMPLATE": { + "value": "", + "title": "Custom Chat Template", + "description": "Custom chat jinja template", + "required": false, + "type": "text", + "category": "Tokenizer Settings" + } + } + ] + }, + { + "title": "System Settings", + "settings": [ + { + "GPU_MEMORY_UTILIZATION": { + "value": "0.95", + "title": "GPU Memory Utilization", + "description": "Sets GPU VRAM utilization", + "required": false, + "type": "number", + "category": "System Settings" + }, + "MAX_PARALLEL_LOADING_WORKERS": { + "value": "", + "title": "Max Parallel Loading Workers", + "description": "Load model sequentially in multiple batches. Leave empty for auto", + "required": false, + "type": "number", + "category": "System Settings" + }, + "BLOCK_SIZE": { + "value": "16", + "title": "Block Size", + "description": "Token block size for contiguous chunks of tokens", + "required": false, + "type": "number", + "category": "System Settings" + }, + "SWAP_SPACE": { + "value": "4", + "title": "Swap Space", + "description": "CPU swap space size (GiB) per GPU", + "required": false, + "type": "number", + "category": "System Settings" + }, + "ENFORCE_EAGER": { + "value": false, + "title": "Enforce Eager", + "description": "Always use eager-mode PyTorch. 
If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", + "required": false, + "type": "toggle", + "category": "System Settings" + }, + "MAX_SEQ_LEN_TO_CAPTURE": { + "value": "8192", + "title": "CUDA Graph Max Content Length", + "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", + "required": false, + "type": "number", + "category": "System Settings" + }, + "DISABLE_CUSTOM_ALL_REDUCE": { + "value": false, + "title": "Disable Custom All Reduce", + "description": "Enables or disables custom all reduce", + "required": false, + "type": "toggle", + "category": "System Settings" + } + } + ] + }, + { + "title": "Streaming Settings", + "settings": [ + { + "DEFAULT_BATCH_SIZE": { + "value": "50", + "title": "Default Final Batch Size", + "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", + "required": false, + "type": "number", + "category": "Streaming Settings" + }, + "DEFAULT_MIN_BATCH_SIZE": { + "value": "1", + "title": "Default Starting Batch Size", + "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", + "required": false, + "type": "number", + "category": "Streaming Settings" + }, + "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { + "value": "3", + "title": "Default Batch Size Growth Factor", + "description": "Growth factor for dynamic batch size", + "required": false, + "type": "number", + "category": "Streaming Settings" + } + } + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + { + "RAW_OPENAI_OUTPUT": { + "value": true, + "title": "Raw OpenAI Output", + "description": "Raw OpenAI output instead of just the text", + "required": false, + "type": "toggle", + "category": "OpenAI Settings" + }, + "OPENAI_RESPONSE_ROLE": { + "value": "assistant", + "title": "OpenAI Response Role", + "description": "Role of the LLM's Response in OpenAI Chat Completions", + "required": false, + "type": "text", + "category": "OpenAI Settings" + }, + "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { + "value": "", + "title": "OpenAI Served Model Name Override", + "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", + "required": false, + "type": "text", + "category": "OpenAI Settings" + } + } + ] + }, + { + "title": "Serverless Settings", + "settings": [ + { + "MAX_CONCURRENCY": { + "value": "300", + "title": "Max Concurrency", + "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", + "required": false, + "type": "number", + "category": "Serverless Settings" + }, + "DISABLE_LOG_STATS": { + "value": true, + "title": "Disable Log Stats", + "description": "Enables or disables vLLM stats logging", + "required": false, + "type": "toggle", + "category": "Serverless Settings" + }, + "DISABLE_LOG_REQUESTS": { + "value": true, + "title": "Disable Log Requests", + "description": "Enables or disables vLLM request logging", + "required": false, + "type": "toggle", + "category": "Serverless Settings" + } + } + ] + } + ] + }, + "0.4.2": { + "categories": [ + { + "title": "LLM Settings", + "settings": [ + { + "MODEL_REVISION": { + "value": "", + "title": "Model Revision", + "description": "Model revision (branch) to load", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "MAX_MODEL_LEN": { + "value": "", + "title": "Max Model Length", + "description": "Maximum number of tokens for the engine to handle per request", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "BASE_PATH": { + "value": "/runpod-volume", + "title": "Base Path", + "description": "Storage directory for Huggingface cache and model", + "required": false, + "type": "text", + "category": "LLM Settings" + }, + "LOAD_FORMAT": { + "value": "auto", + "title": "Load Format", + "description": "Format to load model in", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": ".safetensors", "label": ".safetensors" }, + { "value": ".bin", "label": ".bin" }, + { "value": ".pt", "label": ".pt" } + ] + }, + "QUANTIZATION": { + "value": "None", + "title": "Quantization", + "description": "Quantization of given model. The model must already be quantized", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "None", "label": "None" }, + { "value": "awq", "label": "AWQ" }, + { "value": "squeezellm", "label": "SqueezeLLM" }, + { "value": "gptq", "label": "GPTQ" } + ] + }, + "TRUST_REMOTE_CODE": { + "value": "0", + "title": "Trust Remote Code", + "description": "Trust remote code for HuggingFace models", + "required": false, + "type": "toggle", + "category": "LLM Settings" + }, + "SEED": { + "value": "", + "title": "Seed", + "description": "Sets random seed for operations", + "required": false, + "type": "number", + "category": "LLM Settings" + }, + "KV_CACHE_DTYPE": { + "value": "auto", + "title": "KV Cache Data Type", + "description": "Data type for kv cache storage. 
Uses DTYPE if set to auto", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "fp8_e5m2", "label": "fp8_e5m2" } + ] + }, + "DTYPE": { + "value": "auto", + "title": "Weights Datatype/Precision", + "description": "Sets datatype/precision for model weights and activations", + "required": false, + "type": "select", + "category": "LLM Settings", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "half", "label": "half" }, + { "value": "float16", "label": "float16" }, + { "value": "bfloat16", "label": "bfloat16" }, + { "value": "float", "label": "float" }, + { "value": "float32", "label": "float32" } + ] + } + } + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + { + "TOKENIZER_NAME": { + "value": "", + "title": "Tokenizer Name", + "description": "Tokenizer repo to use a different tokenizer than the model's default", + "required": false, + "type": "text", + "category": "Tokenizer Settings" + }, + "TOKENIZER_REVISION": { + "value": "", + "title": "Tokenizer Revision", + "description": "Tokenizer revision to load", + "required": false, + "type": "text", + "category": "Tokenizer Settings" + }, + "CUSTOM_CHAT_TEMPLATE": { + "value": "", + "title": "Custom Chat Template", + "description": "Custom chat jinja template", + "required": false, + "type": "text", + "category": "Tokenizer Settings" + } + } + ] + }, + { + "title": "System Settings", + "settings": [ + { + "GPU_MEMORY_UTILIZATION": { + "value": "0.95", + "title": "GPU Memory Utilization", + "description": "Sets GPU VRAM utilization", + "required": false, + "type": "number", + "category": "System Settings" + }, + "MAX_PARALLEL_LOADING_WORKERS": { + "value": "", + "title": "Max Parallel Loading Workers", + "description": "Load model sequentially in multiple batches. Leave empty for auto", + "required": false, + "type": "number", + "category": "System Settings" + }, + "BLOCK_SIZE": { + "value": "16", + "title": "Block Size", + "description": "Token block size for contiguous chunks of tokens", + "required": false, + "type": "number", + "category": "System Settings" + }, + "SWAP_SPACE": { + "value": "4", + "title": "Swap Space", + "description": "CPU swap space size (GiB) per GPU", + "required": false, + "type": "number", + "category": "System Settings" + }, + "ENFORCE_EAGER": { + "value": "0", + "title": "Enforce Eager", + "description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", + "required": false, + "type": "toggle", + "category": "System Settings" + }, + "MAX_SEQ_LEN_TO_CAPTURE": { + "value": "8192", + "title": "CUDA Graph Max Content Length", + "description": "Maximum context length covered by CUDA graphs. 
If a sequence has context length larger than this, we fall back to eager mode", + "required": false, + "type": "number", + "category": "System Settings" + }, + "DISABLE_CUSTOM_ALL_REDUCE": { + "value": "0", + "title": "Disable Custom All Reduce", + "description": "Enables or disables custom all reduce", + "required": false, + "type": "toggle", + "category": "System Settings" + } + } + ] + }, + { + "title": "Streaming Settings", + "settings": [ + { + "DEFAULT_BATCH_SIZE": { + "value": "50", + "title": "Default Final Batch Size", + "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", + "required": false, + "type": "number", + "category": "Streaming Settings" + }, + "DEFAULT_MIN_BATCH_SIZE": { + "value": "1", + "title": "Default Starting Batch Size", + "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", + "required": false, + "type": "number", + "category": "Streaming Settings" + }, + "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { + "value": "3", + "title": "Default Batch Size Growth Factor", + "description": "Growth factor for dynamic batch size", + "required": false, + "type": "number", + "category": "Streaming Settings" + } + } + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + { + "RAW_OPENAI_OUTPUT": { + "value": "1", + "title": "Raw OpenAI Output", + "description": "Raw OpenAI output instead of just the text", + "required": false, + "type": "toggle", + "category": "OpenAI Settings" + }, + "OPENAI_RESPONSE_ROLE": { + "value": "assistant", + "title": "OpenAI Response Role", + "description": "Role of the LLM's Response in OpenAI Chat Completions", + "required": false, + "type": "text", + "category": "OpenAI Settings" + }, + "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { + "value": "", + "title": "OpenAI Served Model Name Override", + "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", + "required": false, + "type": "text", + "category": "OpenAI Settings" + } + } + ] + }, + { + "title": "Serverless Settings", + "settings": [ + { + "MAX_CONCURRENCY": { + "value": "300", + "title": "Max Concurrency", + "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", + "required": false, + "type": "number", + "category": "Serverless Settings" + }, + "DISABLE_LOG_STATS": { + "value": "1", + "title": "Disable Log Stats", + "description": "Enables or disables vLLM stats logging", + "required": false, + "type": "toggle", + "category": "Serverless Settings" + }, + "DISABLE_LOG_REQUESTS": { + "value": "1", + "title": "Disable Log Requests", + "description": "Enables or disables vLLM request logging", + "required": false, + "type": "toggle", + "category": "Serverless Settings" + } + } + ] + } + ] + } +} From a40e7803ee161f5d27d1c383b2822671e4a33436 Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Sun, 18 Aug 2024 23:24:40 -0700 Subject: [PATCH 2/8] converted to human readable format --- worker-config.json | 1820 +++++++++++++++++++------------------------- 1 file changed, 793 insertions(+), 1027 deletions(-) diff --git a/worker-config.json b/worker-config.json index 4f96a78..e9e8e37 100644 --- a/worker-config.json +++ b/worker-config.json @@ -1,1032 +1,798 @@ { - "0.5.3": { - "categories": [ - { - "title": "LLM Settings", - "settings": [ - { - "TOKENIZER": { - "value": "", - "title": "Tokenizer", - "description": "Name or path of the Hugging Face tokenizer to use.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "TOKENIZER_MODE": { - "value": "auto", - "title": "Tokenizer Mode", - "description": "The tokenizer mode.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "slow", "label": "slow" } - ] - }, - "SKIP_TOKENIZER_INIT": { - "value": false, - "title": "Skip Tokenizer Init", - "description": "Skip initialization of tokenizer and detokenizer.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "TRUST_REMOTE_CODE": { - "value": false, - "title": "Trust Remote Code", - "description": "Trust remote code from Hugging Face.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "DOWNLOAD_DIR": { - "value": "", - "title": "Download Directory", - "description": "Directory to download and load the weights.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "LOAD_FORMAT": { - "value": "auto", - "title": "Load Format", - "description": "The format of the model weights to load.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "pt", "label": "pt" }, - { "value": "safetensors", "label": "safetensors" }, - { "value": "npcache", "label": "npcache" }, - { "value": "dummy", "label": "dummy" }, - { "value": "tensorizer", "label": "tensorizer" }, - { "value": "bitsandbytes", "label": "bitsandbytes" } - ] - }, - "DTYPE": { - "value": "auto", - "title": "Data Type", - "description": "Data type for model weights and activations.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "half", "label": "half" }, - { "value": "float16", "label": "float16" }, - { "value": "bfloat16", "label": "bfloat16" }, - { "value": "float", "label": "float" }, - { "value": "float32", "label": "float32" } - ] - }, - "KV_CACHE_DTYPE": { - "value": "auto", - "title": "KV Cache Data Type", - "description": "Data type for KV cache storage.", - "required": false, - "type": "select", - "category": "LLM 
Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "fp8", "label": "fp8" } - ] - }, - "QUANTIZATION_PARAM_PATH": { - "value": "", - "title": "Quantization Param Path", - "description": "Path to the JSON file containing the KV cache scaling factors.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "MAX_MODEL_LEN": { - "value": "", - "title": "Max Model Length", - "description": "Model context length.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "GUIDED_DECODING_BACKEND": { - "value": "outlines", - "title": "Guided Decoding Backend", - "description": "Which engine will be used for guided decoding by default.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "outlines", "label": "outlines" }, - { "value": "lm-format-enforcer", "label": "lm-format-enforcer" } - ] - }, - "DISTRIBUTED_EXECUTOR_BACKEND": { - "value": "", - "title": "Distributed Executor Backend", - "description": "Backend to use for distributed serving.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "ray", "label": "ray" }, - { "value": "mp", "label": "mp" } - ] - }, - "WORKER_USE_RAY": { - "value": false, - "title": "Worker Use Ray", - "description": "Deprecated, use --distributed-executor-backend=ray.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "RAY_WORKERS_USE_NSIGHT": { - "value": false, - "title": "Ray Workers Use Nsight", - "description": "If specified, use nsight to profile Ray workers.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "PIPELINE_PARALLEL_SIZE": { - "value": 1, - "title": "Pipeline Parallel Size", - "description": "Number of pipeline stages.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "TENSOR_PARALLEL_SIZE": { - "value": 1, - "title": "Tensor Parallel Size", - "description": "Number of tensor parallel replicas.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MAX_PARALLEL_LOADING_WORKERS": { - "value": "", - "title": "Max Parallel Loading Workers", - "description": "Load model sequentially in multiple batches.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "ENABLE_PREFIX_CACHING": { - "value": false, - "title": "Enable Prefix Caching", - "description": "Enables automatic prefix caching.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "DISABLE_SLIDING_WINDOW": { - "value": false, - "title": "Disable Sliding Window", - "description": "Disables sliding window, capping to sliding window size.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "USE_V2_BLOCK_MANAGER": { - "value": false, - "title": "Use V2 Block Manager", - "description": "Use BlockSpaceMangerV2.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "NUM_LOOKAHEAD_SLOTS": { - "value": 0, - "title": "Num Lookahead Slots", - "description": "Experimental scheduling config necessary for speculative decoding.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "SEED": { - "value": 0, - "title": "Seed", - "description": "Random seed for operations.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "NUM_GPU_BLOCKS_OVERRIDE": { - "value": "", - "title": "Num GPU Blocks Override", - "description": "If specified, ignore GPU profiling result and use this number of GPU 
blocks.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MAX_NUM_BATCHED_TOKENS": { - "value": "", - "title": "Max Num Batched Tokens", - "description": "Maximum number of batched tokens per iteration.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MAX_NUM_SEQS": { - "value": 256, - "title": "Max Num Seqs", - "description": "Maximum number of sequences per iteration.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MAX_LOGPROBS": { - "value": 20, - "title": "Max Logprobs", - "description": "Max number of log probs to return when logprobs is specified in SamplingParams.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "DISABLE_LOG_STATS": { - "value": false, - "title": "Disable Log Stats", - "description": "Disable logging statistics.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "QUANTIZATION": { - "value": "", - "title": "Quantization", - "description": "Method used to quantize the weights.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "None", "label": "None" }, - { "value": "awq", "label": "AWQ" }, - { "value": "squeezellm", "label": "SqueezeLLM" }, - { "value": "gptq", "label": "GPTQ" } - ] - }, - "ROPE_SCALING": { - "value": "", - "title": "RoPE Scaling", - "description": "RoPE scaling configuration in JSON format.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "ROPE_THETA": { - "value": "", - "title": "RoPE Theta", - "description": "RoPE theta. Use with rope_scaling.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "TOKENIZER_POOL_SIZE": { - "value": 0, - "title": "Tokenizer Pool Size", - "description": "Size of tokenizer pool to use for asynchronous tokenization.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "TOKENIZER_POOL_TYPE": { - "value": "ray", - "title": "Tokenizer Pool Type", - "description": "Type of tokenizer pool to use for asynchronous tokenization.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "TOKENIZER_POOL_EXTRA_CONFIG": { - "value": "", - "title": "Tokenizer Pool Extra Config", - "description": "Extra config for tokenizer pool.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "ENABLE_LORA": { - "value": false, - "title": "Enable LoRA", - "description": "If True, enable handling of LoRA adapters.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "MAX_LORAS": { - "value": 1, - "title": "Max LoRAs", - "description": "Max number of LoRAs in a single batch.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MAX_LORA_RANK": { - "value": 16, - "title": "Max LoRA Rank", - "description": "Max LoRA rank.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "LORA_EXTRA_VOCAB_SIZE": { - "value": 256, - "title": "LoRA Extra Vocab Size", - "description": "Maximum size of extra vocabulary for LoRA adapters.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "LORA_DTYPE": { - "value": "auto", - "title": "LoRA Data Type", - "description": "Data type for LoRA.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "float16", "label": "float16" }, - { "value": "bfloat16", "label": "bfloat16" }, - { "value": "float32", "label": 
"float32" } - ] - }, - "LONG_LORA_SCALING_FACTORS": { - "value": "", - "title": "Long LoRA Scaling Factors", - "description": "Specify multiple scaling factors for LoRA adapters.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "MAX_CPU_LORAS": { - "value": "", - "title": "Max CPU LoRAs", - "description": "Maximum number of LoRAs to store in CPU memory.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "FULLY_SHARDED_LORAS": { - "value": false, - "title": "Fully Sharded LoRAs", - "description": "Enable fully sharded LoRA layers.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "DEVICE": { - "value": "auto", - "title": "Device", - "description": "Device type for vLLM execution.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "cuda", "label": "cuda" }, - { "value": "neuron", "label": "neuron" }, - { "value": "cpu", "label": "cpu" }, - { "value": "openvino", "label": "openvino" }, - { "value": "tpu", "label": "tpu" }, - { "value": "xpu", "label": "xpu" } - ] - }, - "SCHEDULER_DELAY_FACTOR": { - "value": 0.0, - "title": "Scheduler Delay Factor", - "description": "Apply a delay before scheduling next prompt.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "ENABLE_CHUNKED_PREFILL": { - "value": false, - "title": "Enable Chunked Prefill", - "description": "Enable chunked prefill requests.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "SPECULATIVE_MODEL": { - "value": "", - "title": "Speculative Model", - "description": "The name of the draft model to be used in speculative decoding.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "NUM_SPECULATIVE_TOKENS": { - "value": "", - "title": "Num Speculative Tokens", - "description": "The number of speculative tokens to sample from the draft model.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": { - "value": "", - "title": "Speculative Draft Tensor Parallel Size", - "description": "Number of tensor parallel replicas for the draft model.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "SPECULATIVE_MAX_MODEL_LEN": { - "value": "", - "title": "Speculative Max Model Length", - "description": "The maximum sequence length supported by the draft model.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "SPECULATIVE_DISABLE_BY_BATCH_SIZE": { - "value": "", - "title": "Speculative Disable by Batch Size", - "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "NGRAM_PROMPT_LOOKUP_MAX": { - "value": "", - "title": "Ngram Prompt Lookup Max", - "description": "Max size of window for ngram prompt lookup in speculative decoding.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "NGRAM_PROMPT_LOOKUP_MIN": { - "value": "", - "title": "Ngram Prompt Lookup Min", - "description": "Min size of window for ngram prompt lookup in speculative decoding.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "SPEC_DECODING_ACCEPTANCE_METHOD": { - "value": "rejection_sampler", - "title": "Speculative Decoding Acceptance Method", - "description": "Specify the acceptance method for draft token verification 
in speculative decoding.", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "rejection_sampler", "label": "rejection_sampler" }, - { "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" } - ] - }, - "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": { - "value": "", - "title": "Typical Acceptance Sampler Posterior Threshold", - "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": { - "value": "", - "title": "Typical Acceptance Sampler Posterior Alpha", - "description": "A scaling factor for the entropy-based threshold for token acceptance.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MODEL_LOADER_EXTRA_CONFIG": { - "value": "", - "title": "Model Loader Extra Config", - "description": "Extra config for model loader.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "PREEMPTION_MODE": { - "value": "", - "title": "Preemption Mode", - "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "PREEMPTION_CHECK_PERIOD": { - "value": 1.0, - "title": "Preemption Check Period", - "description": "How frequently the engine checks if a preemption happens.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "PREEMPTION_CPU_CAPACITY": { - "value": 2, - "title": "Preemption CPU Capacity", - "description": "The percentage of CPU memory used for the saved activations.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "MAX_LOG_LEN": { - "value": "", - "title": "Max Log Length", - "description": "Max number of characters or ID numbers being printed in log.", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "DISABLE_LOGGING_REQUEST": { - "value": false, - "title": "Disable Logging Request", - "description": "Disable logging requests.", - "required": false, - "type": "toggle", - "category": "LLM Settings" - } - } - ] - }, - { - "title": "Tokenizer Settings", - "settings": [ - { - "TOKENIZER_NAME": { - "value": "", - "title": "Tokenizer Name", - "description": "Tokenizer repo to use a different tokenizer than the model's default", - "required": false, - "type": "text", - "category": "Tokenizer Settings" - }, - "TOKENIZER_REVISION": { - "value": "", - "title": "Tokenizer Revision", - "description": "Tokenizer revision to load", - "required": false, - "type": "text", - "category": "Tokenizer Settings" - }, - "CUSTOM_CHAT_TEMPLATE": { - "value": "", - "title": "Custom Chat Template", - "description": "Custom chat jinja template", - "required": false, - "type": "text", - "category": "Tokenizer Settings" - } - } - ] - }, - { - "title": "System Settings", - "settings": [ - { - "GPU_MEMORY_UTILIZATION": { - "value": "0.95", - "title": "GPU Memory Utilization", - "description": "Sets GPU VRAM utilization", - "required": false, - "type": "number", - "category": "System Settings" - }, - "MAX_PARALLEL_LOADING_WORKERS": { - "value": "", - "title": "Max Parallel Loading Workers", - "description": "Load model sequentially in multiple batches. 
Leave empty for auto", - "required": false, - "type": "number", - "category": "System Settings" - }, - "BLOCK_SIZE": { - "value": "16", - "title": "Block Size", - "description": "Token block size for contiguous chunks of tokens", - "required": false, - "type": "number", - "category": "System Settings" - }, - "SWAP_SPACE": { - "value": "4", - "title": "Swap Space", - "description": "CPU swap space size (GiB) per GPU", - "required": false, - "type": "number", - "category": "System Settings" - }, - "ENFORCE_EAGER": { - "value": false, - "title": "Enforce Eager", - "description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", - "required": false, - "type": "toggle", - "category": "System Settings" - }, - "MAX_SEQ_LEN_TO_CAPTURE": { - "value": "8192", - "title": "CUDA Graph Max Content Length", - "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", - "required": false, - "type": "number", - "category": "System Settings" - }, - "DISABLE_CUSTOM_ALL_REDUCE": { - "value": false, - "title": "Disable Custom All Reduce", - "description": "Enables or disables custom all reduce", - "required": false, - "type": "toggle", - "category": "System Settings" - } - } - ] - }, - { - "title": "Streaming Settings", - "settings": [ - { - "DEFAULT_BATCH_SIZE": { - "value": "50", - "title": "Default Final Batch Size", - "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", - "required": false, - "type": "number", - "category": "Streaming Settings" - }, - "DEFAULT_MIN_BATCH_SIZE": { - "value": "1", - "title": "Default Starting Batch Size", - "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", - "required": false, - "type": "number", - "category": "Streaming Settings" - }, - "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { - "value": "3", - "title": "Default Batch Size Growth Factor", - "description": "Growth factor for dynamic batch size", - "required": false, - "type": "number", - "category": "Streaming Settings" - } - } - ] - }, - { - "title": "OpenAI Settings", - "settings": [ - { - "RAW_OPENAI_OUTPUT": { - "value": true, - "title": "Raw OpenAI Output", - "description": "Raw OpenAI output instead of just the text", - "required": false, - "type": "toggle", - "category": "OpenAI Settings" - }, - "OPENAI_RESPONSE_ROLE": { - "value": "assistant", - "title": "OpenAI Response Role", - "description": "Role of the LLM's Response in OpenAI Chat Completions", - "required": false, - "type": "text", - "category": "OpenAI Settings" - }, - "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { - "value": "", - "title": "OpenAI Served Model Name Override", - "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", - "required": false, - "type": "text", - "category": "OpenAI Settings" - } - } - ] - }, - { - "title": "Serverless Settings", - "settings": [ - { - "MAX_CONCURRENCY": { - "value": "300", - "title": "Max Concurrency", - "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", - "required": false, - "type": "number", - "category": "Serverless Settings" - }, - "DISABLE_LOG_STATS": { - "value": true, - "title": "Disable Log Stats", - "description": "Enables or disables vLLM stats logging", - "required": false, - "type": "toggle", - "category": "Serverless Settings" - }, - "DISABLE_LOG_REQUESTS": { - "value": true, - "title": "Disable Log Requests", - "description": "Enables or disables vLLM request logging", - "required": false, - "type": "toggle", - "category": "Serverless Settings" - } - } - ] - } + "versions": { + "0.5.3": { + "categories": [ + { + "title": "LLM Settings", + "settings": [ + "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", + "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", + "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", + "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", + "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", + "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS", + "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", + "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", + "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", + "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", + "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", + "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", + "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", + "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", + "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", + "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", + "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST" + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" + ] + }, + { + "title": "System Settings", + "settings": [ + "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", + "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" + ] + }, + { + "title": "Streaming Settings", + "settings": [ + "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" + ] + }, + { + "title": "Serverless Settings", + "settings": [ + "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" + ] + } + ] + }, + "0.4.2": { + "categories": [ + { + "title": "LLM Settings", + "settings": [ + "MODEL_REVISION", "MAX_MODEL_LEN", "BASE_PATH", "LOAD_FORMAT", "QUANTIZATION", + "TRUST_REMOTE_CODE", "SEED", "KV_CACHE_DTYPE", "DTYPE" + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" + ] + }, + { + "title": "System Settings", + "settings": [ + "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", + "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" + ] + }, + { + 
"title": "Streaming Settings", + "settings": [ + "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" + ] + }, + { + "title": "Serverless Settings", + "settings": [ + "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" + ] + } + ] + }, + "0.3.1": { + "categories": [ + { + "title": "LLM Settings", + "settings": [ + "TOKENIZER", "TRUST_REMOTE_CODE" + ] + } + ] + } + }, + "schema": { + "TOKENIZER": { + "env_var_name": "TOKENIZER", + "value": "", + "title": "Tokenizer", + "description": "Name or path of the Hugging Face tokenizer to use.", + "required": false, + "type": "text" + }, + "TOKENIZER_MODE": { + "env_var_name": "TOKENIZER_MODE", + "value": "auto", + "title": "Tokenizer Mode", + "description": "The tokenizer mode.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "slow", "label": "slow" } + ] + }, + "SKIP_TOKENIZER_INIT": { + "env_var_name": "SKIP_TOKENIZER_INIT", + "value": false, + "title": "Skip Tokenizer Init", + "description": "Skip initialization of tokenizer and detokenizer.", + "required": false, + "type": "toggle" + }, + "TRUST_REMOTE_CODE": { + "env_var_name": "TRUST_REMOTE_CODE", + "value": false, + "title": "Trust Remote Code", + "description": "Trust remote code from Hugging Face.", + "required": false, + "type": "toggle" + }, + "DOWNLOAD_DIR": { + "env_var_name": "DOWNLOAD_DIR", + "value": "", + "title": "Download Directory", + "description": "Directory to download and load the weights.", + "required": false, + "type": "text" + }, + "LOAD_FORMAT": { + "env_var_name": "LOAD_FORMAT", + "value": "auto", + "title": "Load Format", + "description": "The format of the model weights to load.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "pt", "label": "pt" }, + { "value": "safetensors", "label": "safetensors" }, + { "value": "npcache", "label": "npcache" }, + { "value": "dummy", "label": "dummy" }, + { "value": "tensorizer", "label": "tensorizer" }, + { "value": "bitsandbytes", "label": "bitsandbytes" } + ] + }, + "DTYPE": { + "env_var_name": "DTYPE", + "value": "auto", + "title": "Data Type", + "description": "Data type for model weights and activations.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "half", "label": "half" }, + { "value": "float16", "label": "float16" }, + { "value": "bfloat16", "label": "bfloat16" }, + { "value": "float", "label": "float" }, + { "value": "float32", "label": "float32" } + ] + }, + "KV_CACHE_DTYPE": { + "env_var_name": "KV_CACHE_DTYPE", + "value": "auto", + "title": "KV Cache Data Type", + "description": "Data type for KV cache storage.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "fp8", "label": "fp8" } + ] + }, + "QUANTIZATION_PARAM_PATH": { + "env_var_name": "QUANTIZATION_PARAM_PATH", + "value": "", + "title": "Quantization Param Path", + "description": "Path to the JSON file containing the KV cache scaling factors.", + "required": false, + "type": "text" + }, + "MAX_MODEL_LEN": { + "env_var_name": "MAX_MODEL_LEN", + "value": "", + "title": "Max Model Length", + "description": "Model context length.", + "required": false, + "type": "number" + }, + "GUIDED_DECODING_BACKEND": { + "env_var_name": 
"GUIDED_DECODING_BACKEND", + "value": "outlines", + "title": "Guided Decoding Backend", + "description": "Which engine will be used for guided decoding by default.", + "required": false, + "type": "select", + "options": [ + { "value": "outlines", "label": "outlines" }, + { "value": "lm-format-enforcer", "label": "lm-format-enforcer" } + ] + }, + "DISTRIBUTED_EXECUTOR_BACKEND": { + "env_var_name": "DISTRIBUTED_EXECUTOR_BACKEND", + "value": "", + "title": "Distributed Executor Backend", + "description": "Backend to use for distributed serving.", + "required": false, + "type": "select", + "options": [ + { "value": "ray", "label": "ray" }, + { "value": "mp", "label": "mp" } + ] + }, + "WORKER_USE_RAY": { + "env_var_name": "WORKER_USE_RAY", + "value": false, + "title": "Worker Use Ray", + "description": "Deprecated, use --distributed-executor-backend=ray.", + "required": false, + "type": "toggle" + }, + "RAY_WORKERS_USE_NSIGHT": { + "env_var_name": "RAY_WORKERS_USE_NSIGHT", + "value": false, + "title": "Ray Workers Use Nsight", + "description": "If specified, use nsight to profile Ray workers.", + "required": false, + "type": "toggle" + }, + "PIPELINE_PARALLEL_SIZE": { + "env_var_name": "PIPELINE_PARALLEL_SIZE", + "value": 1, + "title": "Pipeline Parallel Size", + "description": "Number of pipeline stages.", + "required": false, + "type": "number" + }, + "TENSOR_PARALLEL_SIZE": { + "env_var_name": "TENSOR_PARALLEL_SIZE", + "value": 1, + "title": "Tensor Parallel Size", + "description": "Number of tensor parallel replicas.", + "required": false, + "type": "number" + }, + "MAX_PARALLEL_LOADING_WORKERS": { + "env_var_name": "MAX_PARALLEL_LOADING_WORKERS", + "value": "", + "title": "Max Parallel Loading Workers", + "description": "Load model sequentially in multiple batches.", + "required": false, + "type": "number" + }, + "ENABLE_PREFIX_CACHING": { + "env_var_name": "ENABLE_PREFIX_CACHING", + "value": false, + "title": "Enable Prefix Caching", + "description": "Enables automatic prefix caching.", + "required": false, + "type": "toggle" + }, + "DISABLE_SLIDING_WINDOW": { + "env_var_name": "DISABLE_SLIDING_WINDOW", + "value": false, + "title": "Disable Sliding Window", + "description": "Disables sliding window, capping to sliding window size.", + "required": false, + "type": "toggle" + }, + "USE_V2_BLOCK_MANAGER": { + "env_var_name": "USE_V2_BLOCK_MANAGER", + "value": false, + "title": "Use V2 Block Manager", + "description": "Use BlockSpaceMangerV2.", + "required": false, + "type": "toggle" + }, + "NUM_LOOKAHEAD_SLOTS": { + "env_var_name": "NUM_LOOKAHEAD_SLOTS", + "value": 0, + "title": "Num Lookahead Slots", + "description": "Experimental scheduling config necessary for speculative decoding.", + "required": false, + "type": "number" + }, + "SEED": { + "env_var_name": "SEED", + "value": 0, + "title": "Seed", + "description": "Random seed for operations.", + "required": false, + "type": "number" + }, + "NUM_GPU_BLOCKS_OVERRIDE": { + "env_var_name": "NUM_GPU_BLOCKS_OVERRIDE", + "value": "", + "title": "Num GPU Blocks Override", + "description": "If specified, ignore GPU profiling result and use this number of GPU blocks.", + "required": false, + "type": "number" + }, + "MAX_NUM_BATCHED_TOKENS": { + "env_var_name": "MAX_NUM_BATCHED_TOKENS", + "value": "", + "title": "Max Num Batched Tokens", + "description": "Maximum number of batched tokens per iteration.", + "required": false, + "type": "number" + }, + "MAX_NUM_SEQS": { + "env_var_name": "MAX_NUM_SEQS", + "value": 256, + "title": "Max Num Seqs", + 
"description": "Maximum number of sequences per iteration.", + "required": false, + "type": "number" + }, + "MAX_LOGPROBS": { + "env_var_name": "MAX_LOGPROBS", + "value": 20, + "title": "Max Logprobs", + "description": "Max number of log probs to return when logprobs is specified in SamplingParams.", + "required": false, + "type": "number" + }, + "DISABLE_LOG_STATS": { + "env_var_name": "DISABLE_LOG_STATS", + "value": false, + "title": "Disable Log Stats", + "description": "Disable logging statistics.", + "required": false, + "type": "toggle" + }, + "QUANTIZATION": { + "env_var_name": "QUANTIZATION", + "value": "", + "title": "Quantization", + "description": "Method used to quantize the weights.", + "required": false, + "type": "select", + "options": [ + { "value": "None", "label": "None" }, + { "value": "awq", "label": "AWQ" }, + { "value": "squeezellm", "label": "SqueezeLLM" }, + { "value": "gptq", "label": "GPTQ" } + ] + }, + "ROPE_SCALING": { + "env_var_name": "ROPE_SCALING", + "value": "", + "title": "RoPE Scaling", + "description": "RoPE scaling configuration in JSON format.", + "required": false, + "type": "text" + }, + "ROPE_THETA": { + "env_var_name": "ROPE_THETA", + "value": "", + "title": "RoPE Theta", + "description": "RoPE theta. Use with rope_scaling.", + "required": false, + "type": "number" + }, + "TOKENIZER_POOL_SIZE": { + "env_var_name": "TOKENIZER_POOL_SIZE", + "value": 0, + "title": "Tokenizer Pool Size", + "description": "Size of tokenizer pool to use for asynchronous tokenization.", + "required": false, + "type": "number" + }, + "TOKENIZER_POOL_TYPE": { + "env_var_name": "TOKENIZER_POOL_TYPE", + "value": "ray", + "title": "Tokenizer Pool Type", + "description": "Type of tokenizer pool to use for asynchronous tokenization.", + "required": false, + "type": "text" + }, + "TOKENIZER_POOL_EXTRA_CONFIG": { + "env_var_name": "TOKENIZER_POOL_EXTRA_CONFIG", + "value": "", + "title": "Tokenizer Pool Extra Config", + "description": "Extra config for tokenizer pool.", + "required": false, + "type": "text" + }, + "ENABLE_LORA": { + "env_var_name": "ENABLE_LORA", + "value": false, + "title": "Enable LoRA", + "description": "If True, enable handling of LoRA adapters.", + "required": false, + "type": "toggle" + }, + "MAX_LORAS": { + "env_var_name": "MAX_LORAS", + "value": 1, + "title": "Max LoRAs", + "description": "Max number of LoRAs in a single batch.", + "required": false, + "type": "number" + }, + "MAX_LORA_RANK": { + "env_var_name": "MAX_LORA_RANK", + "value": 16, + "title": "Max LoRA Rank", + "description": "Max LoRA rank.", + "required": false, + "type": "number" + }, + "LORA_EXTRA_VOCAB_SIZE": { + "env_var_name": "LORA_EXTRA_VOCAB_SIZE", + "value": 256, + "title": "LoRA Extra Vocab Size", + "description": "Maximum size of extra vocabulary for LoRA adapters.", + "required": false, + "type": "number" + }, + "LORA_DTYPE": { + "env_var_name": "LORA_DTYPE", + "value": "auto", + "title": "LoRA Data Type", + "description": "Data type for LoRA.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "float16", "label": "float16" }, + { "value": "bfloat16", "label": "bfloat16" }, + { "value": "float32", "label": "float32" } + ] + }, + "LONG_LORA_SCALING_FACTORS": { + "env_var_name": "LONG_LORA_SCALING_FACTORS", + "value": "", + "title": "Long LoRA Scaling Factors", + "description": "Specify multiple scaling factors for LoRA adapters.", + "required": false, + "type": "text" + }, + "MAX_CPU_LORAS": { + "env_var_name": 
"MAX_CPU_LORAS", + "value": "", + "title": "Max CPU LoRAs", + "description": "Maximum number of LoRAs to store in CPU memory.", + "required": false, + "type": "number" + }, + "FULLY_SHARDED_LORAS": { + "env_var_name": "FULLY_SHARDED_LORAS", + "value": false, + "title": "Fully Sharded LoRAs", + "description": "Enable fully sharded LoRA layers.", + "required": false, + "type": "toggle" + }, + "DEVICE": { + "env_var_name": "DEVICE", + "value": "auto", + "title": "Device", + "description": "Device type for vLLM execution.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "cuda", "label": "cuda" }, + { "value": "neuron", "label": "neuron" }, + { "value": "cpu", "label": "cpu" }, + { "value": "openvino", "label": "openvino" }, + { "value": "tpu", "label": "tpu" }, + { "value": "xpu", "label": "xpu" } ] }, - "0.4.2": { - "categories": [ - { - "title": "LLM Settings", - "settings": [ - { - "MODEL_REVISION": { - "value": "", - "title": "Model Revision", - "description": "Model revision (branch) to load", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "MAX_MODEL_LEN": { - "value": "", - "title": "Max Model Length", - "description": "Maximum number of tokens for the engine to handle per request", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "BASE_PATH": { - "value": "/runpod-volume", - "title": "Base Path", - "description": "Storage directory for Huggingface cache and model", - "required": false, - "type": "text", - "category": "LLM Settings" - }, - "LOAD_FORMAT": { - "value": "auto", - "title": "Load Format", - "description": "Format to load model in", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": ".safetensors", "label": ".safetensors" }, - { "value": ".bin", "label": ".bin" }, - { "value": ".pt", "label": ".pt" } - ] - }, - "QUANTIZATION": { - "value": "None", - "title": "Quantization", - "description": "Quantization of given model. The model must already be quantized", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "None", "label": "None" }, - { "value": "awq", "label": "AWQ" }, - { "value": "squeezellm", "label": "SqueezeLLM" }, - { "value": "gptq", "label": "GPTQ" } - ] - }, - "TRUST_REMOTE_CODE": { - "value": "0", - "title": "Trust Remote Code", - "description": "Trust remote code for HuggingFace models", - "required": false, - "type": "toggle", - "category": "LLM Settings" - }, - "SEED": { - "value": "", - "title": "Seed", - "description": "Sets random seed for operations", - "required": false, - "type": "number", - "category": "LLM Settings" - }, - "KV_CACHE_DTYPE": { - "value": "auto", - "title": "KV Cache Data Type", - "description": "Data type for kv cache storage. 
Uses DTYPE if set to auto", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "fp8_e5m2", "label": "fp8_e5m2" } - ] - }, - "DTYPE": { - "value": "auto", - "title": "Weights Datatype/Precision", - "description": "Sets datatype/precision for model weights and activations", - "required": false, - "type": "select", - "category": "LLM Settings", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "half", "label": "half" }, - { "value": "float16", "label": "float16" }, - { "value": "bfloat16", "label": "bfloat16" }, - { "value": "float", "label": "float" }, - { "value": "float32", "label": "float32" } - ] - } - } - ] - }, - { - "title": "Tokenizer Settings", - "settings": [ - { - "TOKENIZER_NAME": { - "value": "", - "title": "Tokenizer Name", - "description": "Tokenizer repo to use a different tokenizer than the model's default", - "required": false, - "type": "text", - "category": "Tokenizer Settings" - }, - "TOKENIZER_REVISION": { - "value": "", - "title": "Tokenizer Revision", - "description": "Tokenizer revision to load", - "required": false, - "type": "text", - "category": "Tokenizer Settings" - }, - "CUSTOM_CHAT_TEMPLATE": { - "value": "", - "title": "Custom Chat Template", - "description": "Custom chat jinja template", - "required": false, - "type": "text", - "category": "Tokenizer Settings" - } - } - ] - }, - { - "title": "System Settings", - "settings": [ - { - "GPU_MEMORY_UTILIZATION": { - "value": "0.95", - "title": "GPU Memory Utilization", - "description": "Sets GPU VRAM utilization", - "required": false, - "type": "number", - "category": "System Settings" - }, - "MAX_PARALLEL_LOADING_WORKERS": { - "value": "", - "title": "Max Parallel Loading Workers", - "description": "Load model sequentially in multiple batches. Leave empty for auto", - "required": false, - "type": "number", - "category": "System Settings" - }, - "BLOCK_SIZE": { - "value": "16", - "title": "Block Size", - "description": "Token block size for contiguous chunks of tokens", - "required": false, - "type": "number", - "category": "System Settings" - }, - "SWAP_SPACE": { - "value": "4", - "title": "Swap Space", - "description": "CPU swap space size (GiB) per GPU", - "required": false, - "type": "number", - "category": "System Settings" - }, - "ENFORCE_EAGER": { - "value": "0", - "title": "Enforce Eager", - "description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", - "required": false, - "type": "toggle", - "category": "System Settings" - }, - "MAX_SEQ_LEN_TO_CAPTURE": { - "value": "8192", - "title": "CUDA Graph Max Content Length", - "description": "Maximum context length covered by CUDA graphs. 
If a sequence has context length larger than this, we fall back to eager mode", - "required": false, - "type": "number", - "category": "System Settings" - }, - "DISABLE_CUSTOM_ALL_REDUCE": { - "value": "0", - "title": "Disable Custom All Reduce", - "description": "Enables or disables custom all reduce", - "required": false, - "type": "toggle", - "category": "System Settings" - } - } - ] - }, - { - "title": "Streaming Settings", - "settings": [ - { - "DEFAULT_BATCH_SIZE": { - "value": "50", - "title": "Default Final Batch Size", - "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", - "required": false, - "type": "number", - "category": "Streaming Settings" - }, - "DEFAULT_MIN_BATCH_SIZE": { - "value": "1", - "title": "Default Starting Batch Size", - "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", - "required": false, - "type": "number", - "category": "Streaming Settings" - }, - "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { - "value": "3", - "title": "Default Batch Size Growth Factor", - "description": "Growth factor for dynamic batch size", - "required": false, - "type": "number", - "category": "Streaming Settings" - } - } - ] - }, - { - "title": "OpenAI Settings", - "settings": [ - { - "RAW_OPENAI_OUTPUT": { - "value": "1", - "title": "Raw OpenAI Output", - "description": "Raw OpenAI output instead of just the text", - "required": false, - "type": "toggle", - "category": "OpenAI Settings" - }, - "OPENAI_RESPONSE_ROLE": { - "value": "assistant", - "title": "OpenAI Response Role", - "description": "Role of the LLM's Response in OpenAI Chat Completions", - "required": false, - "type": "text", - "category": "OpenAI Settings" - }, - "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { - "value": "", - "title": "OpenAI Served Model Name Override", - "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", - "required": false, - "type": "text", - "category": "OpenAI Settings" - } - } - ] - }, - { - "title": "Serverless Settings", - "settings": [ - { - "MAX_CONCURRENCY": { - "value": "300", - "title": "Max Concurrency", - "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", - "required": false, - "type": "number", - "category": "Serverless Settings" - }, - "DISABLE_LOG_STATS": { - "value": "1", - "title": "Disable Log Stats", - "description": "Enables or disables vLLM stats logging", - "required": false, - "type": "toggle", - "category": "Serverless Settings" - }, - "DISABLE_LOG_REQUESTS": { - "value": "1", - "title": "Disable Log Requests", - "description": "Enables or disables vLLM request logging", - "required": false, - "type": "toggle", - "category": "Serverless Settings" - } - } - ] - } + "SCHEDULER_DELAY_FACTOR": { + "env_var_name": "SCHEDULER_DELAY_FACTOR", + "value": 0.0, + "title": "Scheduler Delay Factor", + "description": "Apply a delay before scheduling next prompt.", + "required": false, + "type": "number" + }, + "ENABLE_CHUNKED_PREFILL": { + "env_var_name": "ENABLE_CHUNKED_PREFILL", + "value": false, + "title": "Enable Chunked Prefill", + "description": "Enable chunked prefill requests.", + "required": false, + "type": "toggle" + }, + "SPECULATIVE_MODEL": { + "env_var_name": "SPECULATIVE_MODEL", + "value": "", + "title": "Speculative Model", + "description": "The name of the draft model to be used in speculative decoding.", + "required": false, + "type": "text" + }, + "NUM_SPECULATIVE_TOKENS": { + "env_var_name": "NUM_SPECULATIVE_TOKENS", + "value": "", + "title": "Num Speculative Tokens", + "description": "The number of speculative tokens to sample from the draft model.", + "required": false, + "type": "number" + }, + "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": { + "env_var_name": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", + "value": "", + "title": "Speculative Draft Tensor Parallel Size", + "description": "Number of tensor parallel replicas for the draft model.", + "required": false, + "type": "number" + }, + "SPECULATIVE_MAX_MODEL_LEN": { + "env_var_name": "SPECULATIVE_MAX_MODEL_LEN", + "value": "", + "title": "Speculative Max Model Length", + "description": "The maximum sequence length supported by the draft model.", + "required": false, + "type": "number" + }, + "SPECULATIVE_DISABLE_BY_BATCH_SIZE": { + "env_var_name": "SPECULATIVE_DISABLE_BY_BATCH_SIZE", + "value": "", + "title": "Speculative Disable by Batch Size", + "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", + "required": false, + "type": "number" + }, + "NGRAM_PROMPT_LOOKUP_MAX": { + "env_var_name": "NGRAM_PROMPT_LOOKUP_MAX", + "value": "", + "title": "Ngram Prompt Lookup Max", + "description": "Max size of window for ngram prompt lookup in speculative decoding.", + "required": false, + "type": "number" + }, + "NGRAM_PROMPT_LOOKUP_MIN": { + "env_var_name": "NGRAM_PROMPT_LOOKUP_MIN", + "value": "", + "title": "Ngram Prompt Lookup Min", + "description": "Min size of window for ngram prompt lookup in speculative decoding.", + "required": false, + "type": "number" + }, + "SPEC_DECODING_ACCEPTANCE_METHOD": { + "env_var_name": "SPEC_DECODING_ACCEPTANCE_METHOD", + "value": "rejection_sampler", + "title": "Speculative Decoding Acceptance Method", + "description": "Specify the acceptance method for draft token verification in speculative decoding.", + "required": false, + "type": "select", + "options": [ + { "value": "rejection_sampler", "label": "rejection_sampler" }, + { "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" } ] + }, + 
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": { + "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", + "value": "", + "title": "Typical Acceptance Sampler Posterior Threshold", + "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", + "required": false, + "type": "number" + }, + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": { + "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", + "value": "", + "title": "Typical Acceptance Sampler Posterior Alpha", + "description": "A scaling factor for the entropy-based threshold for token acceptance.", + "required": false, + "type": "number" + }, + "MODEL_LOADER_EXTRA_CONFIG": { + "env_var_name": "MODEL_LOADER_EXTRA_CONFIG", + "value": "", + "title": "Model Loader Extra Config", + "description": "Extra config for model loader.", + "required": false, + "type": "text" + }, + "PREEMPTION_MODE": { + "env_var_name": "PREEMPTION_MODE", + "value": "", + "title": "Preemption Mode", + "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", + "required": false, + "type": "text" + }, + "PREEMPTION_CHECK_PERIOD": { + "env_var_name": "PREEMPTION_CHECK_PERIOD", + "value": 1.0, + "title": "Preemption Check Period", + "description": "How frequently the engine checks if a preemption happens.", + "required": false, + "type": "number" + }, + "PREEMPTION_CPU_CAPACITY": { + "env_var_name": "PREEMPTION_CPU_CAPACITY", + "value": 2, + "title": "Preemption CPU Capacity", + "description": "The percentage of CPU memory used for the saved activations.", + "required": false, + "type": "number" + }, + "MAX_LOG_LEN": { + "env_var_name": "MAX_LOG_LEN", + "value": "", + "title": "Max Log Length", + "description": "Max number of characters or ID numbers being printed in log.", + "required": false, + "type": "number" + }, + "DISABLE_LOGGING_REQUEST": { + "env_var_name": "DISABLE_LOGGING_REQUEST", + "value": false, + "title": "Disable Logging Request", + "description": "Disable logging requests.", + "required": false, + "type": "toggle" + }, + "TOKENIZER_NAME": { + "env_var_name": "TOKENIZER_NAME", + "value": "", + "title": "Tokenizer Name", + "description": "Tokenizer repo to use a different tokenizer than the model's default", + "required": false, + "type": "text" + }, + "TOKENIZER_REVISION": { + "env_var_name": "TOKENIZER_REVISION", + "value": "", + "title": "Tokenizer Revision", + "description": "Tokenizer revision to load", + "required": false, + "type": "text" + }, + "CUSTOM_CHAT_TEMPLATE": { + "env_var_name": "CUSTOM_CHAT_TEMPLATE", + "value": "", + "title": "Custom Chat Template", + "description": "Custom chat jinja template", + "required": false, + "type": "text" + }, + "GPU_MEMORY_UTILIZATION": { + "env_var_name": "GPU_MEMORY_UTILIZATION", + "value": "0.95", + "title": "GPU Memory Utilization", + "description": "Sets GPU VRAM utilization", + "required": false, + "type": "number" + }, + "BLOCK_SIZE": { + "env_var_name": "BLOCK_SIZE", + "value": "16", + "title": "Block Size", + "description": "Token block size for contiguous chunks of tokens", + "required": false, + "type": "number" + }, + "SWAP_SPACE": { + "env_var_name": "SWAP_SPACE", + "value": "4", + "title": "Swap Space", + "description": "CPU swap space size (GiB) per GPU", + "required": false, + "type": "number" + }, + "ENFORCE_EAGER": { + "env_var_name": "ENFORCE_EAGER", + "value": false, + "title": "Enforce Eager", + "description": 
"Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", + "required": false, + "type": "toggle" + }, + "MAX_SEQ_LEN_TO_CAPTURE": { + "env_var_name": "MAX_SEQ_LEN_TO_CAPTURE", + "value": "8192", + "title": "CUDA Graph Max Content Length", + "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", + "required": false, + "type": "number" + }, + "DISABLE_CUSTOM_ALL_REDUCE": { + "env_var_name": "DISABLE_CUSTOM_ALL_REDUCE", + "value": false, + "title": "Disable Custom All Reduce", + "description": "Enables or disables custom all reduce", + "required": false, + "type": "toggle" + }, + "DEFAULT_BATCH_SIZE": { + "env_var_name": "DEFAULT_BATCH_SIZE", + "value": "50", + "title": "Default Final Batch Size", + "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", + "required": false, + "type": "number" + }, + "DEFAULT_MIN_BATCH_SIZE": { + "env_var_name": "DEFAULT_MIN_BATCH_SIZE", + "value": "1", + "title": "Default Starting Batch Size", + "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", + "required": false, + "type": "number" + }, + "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { + "env_var_name": "DEFAULT_BATCH_SIZE_GROWTH_FACTOR", + "value": "3", + "title": "Default Batch Size Growth Factor", + "description": "Growth factor for dynamic batch size", + "required": false, + "type": "number" + }, + "RAW_OPENAI_OUTPUT": { + "env_var_name": "RAW_OPENAI_OUTPUT", + "value": true, + "title": "Raw OpenAI Output", + "description": "Raw OpenAI output instead of just the text", + "required": false, + "type": "toggle" + }, + "OPENAI_RESPONSE_ROLE": { + "env_var_name": "OPENAI_RESPONSE_ROLE", + "value": "assistant", + "title": "OpenAI Response Role", + "description": "Role of the LLM's Response in OpenAI Chat Completions", + "required": false, + "type": "text" + }, + "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { + "env_var_name": "OPENAI_SERVED_MODEL_NAME_OVERRIDE", + "value": "", + "title": "OpenAI Served Model Name Override", + "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", + "required": false, + "type": "text" + }, + "MAX_CONCURRENCY": { + "env_var_name": "MAX_CONCURRENCY", + "value": "300", + "title": "Max Concurrency", + "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", + "required": false, + "type": "number" + }, + "MODEL_REVISION": { + "env_var_name": "MODEL_REVISION", + "value": "", + "title": "Model Revision", + "description": "Model revision (branch) to load", + "required": false, + "type": "text" + }, + "BASE_PATH": { + "env_var_name": "BASE_PATH", + "value": "/runpod-volume", + "title": "Base Path", + "description": "Storage directory for Huggingface cache and model", + "required": false, + "type": "text" + } } -} +} \ No newline at end of file From 21a1e138b4c312d407e081c4d543163b5fdc3c97 Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Sun, 18 Aug 2024 23:57:19 -0700 Subject: [PATCH 3/8] updated version of human readable config --- worker-config.json | 1167 +++++++++++++++++++++++--------------------- 1 file changed, 613 insertions(+), 554 deletions(-) diff --git a/worker-config.json b/worker-config.json index e9e8e37..f15ca2d 100644 --- a/worker-config.json +++ b/worker-config.json @@ -1,235 +1,286 @@ { - "versions": { - "0.5.3": { - "categories": [ - { - "title": "LLM Settings", - "settings": [ - "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", - "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", - "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", - "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", - "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", - "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS", - "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", - "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", - "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", - "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", - "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", - "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", - "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", - "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", - "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", - "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", - "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", - "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST" - ] - }, - { - "title": "Tokenizer Settings", - "settings": [ - "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" - ] - }, - { - "title": "System Settings", - "settings": [ - "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", - "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" - ] - }, - { - "title": "Streaming Settings", - "settings": [ - "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" - ] - }, - { - "title": "OpenAI Settings", - "settings": [ - "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" - ] - }, - { - "title": "Serverless Settings", - "settings": [ - "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" - ] - } - ] - }, - "0.4.2": { - "categories": [ - { - "title": "LLM Settings", - "settings": [ - "MODEL_REVISION", "MAX_MODEL_LEN", "BASE_PATH", "LOAD_FORMAT", 
"QUANTIZATION", - "TRUST_REMOTE_CODE", "SEED", "KV_CACHE_DTYPE", "DTYPE" - ] - }, - { - "title": "Tokenizer Settings", - "settings": [ - "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" - ] - }, - { - "title": "System Settings", - "settings": [ - "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", - "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" - ] - }, - { - "title": "Streaming Settings", - "settings": [ - "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" - ] - }, - { - "title": "OpenAI Settings", - "settings": [ - "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" - ] - }, - { - "title": "Serverless Settings", - "settings": [ - "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" - ] - } - ] - }, - "0.3.1": { - "categories": [ - { - "title": "LLM Settings", - "settings": [ - "TOKENIZER", "TRUST_REMOTE_CODE" - ] - } - ] - } - }, - "schema": { - "TOKENIZER": { - "env_var_name": "TOKENIZER", - "value": "", - "title": "Tokenizer", - "description": "Name or path of the Hugging Face tokenizer to use.", - "required": false, - "type": "text" - }, - "TOKENIZER_MODE": { - "env_var_name": "TOKENIZER_MODE", - "value": "auto", - "title": "Tokenizer Mode", - "description": "The tokenizer mode.", - "required": false, - "type": "select", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "slow", "label": "slow" } - ] - }, - "SKIP_TOKENIZER_INIT": { - "env_var_name": "SKIP_TOKENIZER_INIT", - "value": false, - "title": "Skip Tokenizer Init", - "description": "Skip initialization of tokenizer and detokenizer.", - "required": false, - "type": "toggle" - }, - "TRUST_REMOTE_CODE": { - "env_var_name": "TRUST_REMOTE_CODE", - "value": false, - "title": "Trust Remote Code", - "description": "Trust remote code from Hugging Face.", - "required": false, - "type": "toggle" - }, - "DOWNLOAD_DIR": { - "env_var_name": "DOWNLOAD_DIR", - "value": "", - "title": "Download Directory", - "description": "Directory to download and load the weights.", - "required": false, - "type": "text" - }, - "LOAD_FORMAT": { - "env_var_name": "LOAD_FORMAT", - "value": "auto", - "title": "Load Format", - "description": "The format of the model weights to load.", - "required": false, - "type": "select", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "pt", "label": "pt" }, - { "value": "safetensors", "label": "safetensors" }, - { "value": "npcache", "label": "npcache" }, - { "value": "dummy", "label": "dummy" }, - { "value": "tensorizer", "label": "tensorizer" }, - { "value": "bitsandbytes", "label": "bitsandbytes" } - ] - }, - "DTYPE": { - "env_var_name": "DTYPE", - "value": "auto", - "title": "Data Type", - "description": "Data type for model weights and activations.", - "required": false, - "type": "select", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "half", "label": "half" }, - { "value": "float16", "label": "float16" }, - { "value": "bfloat16", "label": "bfloat16" }, - { "value": "float", "label": "float" }, - { "value": "float32", "label": "float32" } - ] - }, - "KV_CACHE_DTYPE": { - "env_var_name": "KV_CACHE_DTYPE", - "value": "auto", - "title": "KV Cache Data Type", - "description": "Data type for KV cache storage.", - "required": false, - "type": "select", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "fp8", "label": "fp8" } - ] - }, - "QUANTIZATION_PARAM_PATH": { - "env_var_name": "QUANTIZATION_PARAM_PATH", 
- "value": "", - "title": "Quantization Param Path", - "description": "Path to the JSON file containing the KV cache scaling factors.", - "required": false, - "type": "text" - }, - "MAX_MODEL_LEN": { - "env_var_name": "MAX_MODEL_LEN", - "value": "", - "title": "Max Model Length", - "description": "Model context length.", - "required": false, - "type": "number" - }, - "GUIDED_DECODING_BACKEND": { - "env_var_name": "GUIDED_DECODING_BACKEND", - "value": "outlines", - "title": "Guided Decoding Backend", - "description": "Which engine will be used for guided decoding by default.", - "required": false, - "type": "select", - "options": [ - { "value": "outlines", "label": "outlines" }, - { "value": "lm-format-enforcer", "label": "lm-format-enforcer" } - ] - }, - "DISTRIBUTED_EXECUTOR_BACKEND": { + "versions": { + "0.5.4": { + "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0 ", + "categories": [ + { + "title": "LLM Settings", + "settings": [ + "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", + "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", + "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", + "WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", + "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", + "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS", + "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", + "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", + "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", + "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", + "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", + "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", + "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", + "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", + "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", + "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", + "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST" + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" + ] + }, + { + "title": "System Settings", + "settings": [ + "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", + "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" + ] + }, + { + "title": "Streaming Settings", + "settings": [ + "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" + ] + }, + { + "title": "Serverless Settings", + "settings": [ + "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" + ] + } + ] + }, + "0.5.3": { + "imageName": "runpod/worker-v1-vllm:stable-cuda12.1.0", + "categories": [ + { + "title": "LLM Settings", + "settings": [ + "TOKENIZER", "TOKENIZER_MODE", "SKIP_TOKENIZER_INIT", "TRUST_REMOTE_CODE", + "DOWNLOAD_DIR", "LOAD_FORMAT", "DTYPE", "KV_CACHE_DTYPE", "QUANTIZATION_PARAM_PATH", + "MAX_MODEL_LEN", "GUIDED_DECODING_BACKEND", "DISTRIBUTED_EXECUTOR_BACKEND", + 
"WORKER_USE_RAY", "RAY_WORKERS_USE_NSIGHT", "PIPELINE_PARALLEL_SIZE", + "TENSOR_PARALLEL_SIZE", "MAX_PARALLEL_LOADING_WORKERS", "ENABLE_PREFIX_CACHING", + "DISABLE_SLIDING_WINDOW", "USE_V2_BLOCK_MANAGER", "NUM_LOOKAHEAD_SLOTS", + "SEED", "NUM_GPU_BLOCKS_OVERRIDE", "MAX_NUM_BATCHED_TOKENS", "MAX_NUM_SEQS", + "MAX_LOGPROBS", "DISABLE_LOG_STATS", "QUANTIZATION", "ROPE_SCALING", "ROPE_THETA", + "TOKENIZER_POOL_SIZE", "TOKENIZER_POOL_TYPE", "TOKENIZER_POOL_EXTRA_CONFIG", + "ENABLE_LORA", "MAX_LORAS", "MAX_LORA_RANK", "LORA_EXTRA_VOCAB_SIZE", + "LORA_DTYPE", "LONG_LORA_SCALING_FACTORS", "MAX_CPU_LORAS", "FULLY_SHARDED_LORAS", + "DEVICE", "SCHEDULER_DELAY_FACTOR", "ENABLE_CHUNKED_PREFILL", "SPECULATIVE_MODEL", + "NUM_SPECULATIVE_TOKENS", "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", + "SPECULATIVE_MAX_MODEL_LEN", "SPECULATIVE_DISABLE_BY_BATCH_SIZE", + "NGRAM_PROMPT_LOOKUP_MAX", "NGRAM_PROMPT_LOOKUP_MIN", "SPEC_DECODING_ACCEPTANCE_METHOD", + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", + "MODEL_LOADER_EXTRA_CONFIG", "PREEMPTION_MODE", "PREEMPTION_CHECK_PERIOD", + "PREEMPTION_CPU_CAPACITY", "MAX_LOG_LEN", "DISABLE_LOGGING_REQUEST" + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" + ] + }, + { + "title": "System Settings", + "settings": [ + "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", + "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" + ] + }, + { + "title": "Streaming Settings", + "settings": [ + "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" + ] + }, + { + "title": "Serverless Settings", + "settings": [ + "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" + ] + } + ] + }, + "0.4.2": { + "imageName": "runpod/worker-vllm:stable-cuda12.1.0", + "categories": [ + { + "title": "LLM Settings", + "settings": [ + "MODEL_REVISION", "MAX_MODEL_LEN", "BASE_PATH", "LOAD_FORMAT", "QUANTIZATION", + "TRUST_REMOTE_CODE", "SEED", "KV_CACHE_DTYPE", "DTYPE" + ] + }, + { + "title": "Tokenizer Settings", + "settings": [ + "TOKENIZER_NAME", "TOKENIZER_REVISION", "CUSTOM_CHAT_TEMPLATE" + ] + }, + { + "title": "System Settings", + "settings": [ + "GPU_MEMORY_UTILIZATION", "MAX_PARALLEL_LOADING_WORKERS", "BLOCK_SIZE", + "SWAP_SPACE", "ENFORCE_EAGER", "MAX_SEQ_LEN_TO_CAPTURE", "DISABLE_CUSTOM_ALL_REDUCE" + ] + }, + { + "title": "Streaming Settings", + "settings": [ + "DEFAULT_BATCH_SIZE", "DEFAULT_MIN_BATCH_SIZE", "DEFAULT_BATCH_SIZE_GROWTH_FACTOR" + ] + }, + { + "title": "OpenAI Settings", + "settings": [ + "RAW_OPENAI_OUTPUT", "OPENAI_RESPONSE_ROLE", "OPENAI_SERVED_MODEL_NAME_OVERRIDE" + ] + }, + { + "title": "Serverless Settings", + "settings": [ + "MAX_CONCURRENCY", "DISABLE_LOG_STATS", "DISABLE_LOG_REQUESTS" + ] + } + ] + } + }, + "schema": { + "TOKENIZER": { + "env_var_name": "TOKENIZER", + "value": "", + "title": "Tokenizer", + "description": "Name or path of the Hugging Face tokenizer to use.", + "required": false, + "type": "text" + }, + "TOKENIZER_MODE": { + "env_var_name": "TOKENIZER_MODE", + "value": "auto", + "title": "Tokenizer Mode", + "description": "The tokenizer mode.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "slow", "label": "slow" } + ] + }, + "SKIP_TOKENIZER_INIT": 
{ + "env_var_name": "SKIP_TOKENIZER_INIT", + "value": false, + "title": "Skip Tokenizer Init", + "description": "Skip initialization of tokenizer and detokenizer.", + "required": false, + "type": "toggle" + }, + "TRUST_REMOTE_CODE": { + "env_var_name": "TRUST_REMOTE_CODE", + "value": false, + "title": "Trust Remote Code", + "description": "Trust remote code from Hugging Face.", + "required": false, + "type": "toggle" + }, + "DOWNLOAD_DIR": { + "env_var_name": "DOWNLOAD_DIR", + "value": "", + "title": "Download Directory", + "description": "Directory to download and load the weights.", + "required": false, + "type": "text" + }, + "LOAD_FORMAT": { + "env_var_name": "LOAD_FORMAT", + "value": "auto", + "title": "Load Format", + "description": "The format of the model weights to load.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "pt", "label": "pt" }, + { "value": "safetensors", "label": "safetensors" }, + { "value": "npcache", "label": "npcache" }, + { "value": "dummy", "label": "dummy" }, + { "value": "tensorizer", "label": "tensorizer" }, + { "value": "bitsandbytes", "label": "bitsandbytes" } + ] + }, + "DTYPE": { + "env_var_name": "DTYPE", + "value": "auto", + "title": "Data Type", + "description": "Data type for model weights and activations.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "half", "label": "half" }, + { "value": "float16", "label": "float16" }, + { "value": "bfloat16", "label": "bfloat16" }, + { "value": "float", "label": "float" }, + { "value": "float32", "label": "float32" } + ] + }, + "KV_CACHE_DTYPE": { + "env_var_name": "KV_CACHE_DTYPE", + "value": "auto", + "title": "KV Cache Data Type", + "description": "Data type for KV cache storage.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "fp8", "label": "fp8" } + ] + }, + "QUANTIZATION_PARAM_PATH": { + "env_var_name": "QUANTIZATION_PARAM_PATH", + "value": "", + "title": "Quantization Param Path", + "description": "Path to the JSON file containing the KV cache scaling factors.", + "required": false, + "type": "text" + }, + "MAX_MODEL_LEN": { + "env_var_name": "MAX_MODEL_LEN", + "value": "", + "title": "Max Model Length", + "description": "Model context length.", + "required": false, + "type": "number" + }, + "GUIDED_DECODING_BACKEND": { + "env_var_name": "GUIDED_DECODING_BACKEND", + "value": "outlines", + "title": "Guided Decoding Backend", + "description": "Which engine will be used for guided decoding by default.", + "required": false, + "type": "select", + "options": [ + { "value": "outlines", "label": "outlines" }, + { "value": "lm-format-enforcer", "label": "lm-format-enforcer" } + ] + }, + "DISTRIBUTED_EXECUTOR_BACKEND": { "env_var_name": "DISTRIBUTED_EXECUTOR_BACKEND", "value": "", "title": "Distributed Executor Backend", @@ -470,329 +521,337 @@ "type": "text" }, "MAX_CPU_LORAS": { - "env_var_name": "MAX_CPU_LORAS", - "value": "", - "title": "Max CPU LoRAs", - "description": "Maximum number of LoRAs to store in CPU memory.", - "required": false, - "type": "number" - }, - "FULLY_SHARDED_LORAS": { - "env_var_name": "FULLY_SHARDED_LORAS", - "value": false, - "title": "Fully Sharded LoRAs", - "description": "Enable fully sharded LoRA layers.", - "required": false, - "type": "toggle" - }, - "DEVICE": { - "env_var_name": "DEVICE", - "value": "auto", - "title": "Device", - "description": "Device type for vLLM execution.", - 
"required": false, - "type": "select", - "options": [ - { "value": "auto", "label": "auto" }, - { "value": "cuda", "label": "cuda" }, - { "value": "neuron", "label": "neuron" }, - { "value": "cpu", "label": "cpu" }, - { "value": "openvino", "label": "openvino" }, - { "value": "tpu", "label": "tpu" }, - { "value": "xpu", "label": "xpu" } - ] - }, - "SCHEDULER_DELAY_FACTOR": { - "env_var_name": "SCHEDULER_DELAY_FACTOR", - "value": 0.0, - "title": "Scheduler Delay Factor", - "description": "Apply a delay before scheduling next prompt.", - "required": false, - "type": "number" - }, - "ENABLE_CHUNKED_PREFILL": { - "env_var_name": "ENABLE_CHUNKED_PREFILL", - "value": false, - "title": "Enable Chunked Prefill", - "description": "Enable chunked prefill requests.", - "required": false, - "type": "toggle" - }, - "SPECULATIVE_MODEL": { - "env_var_name": "SPECULATIVE_MODEL", - "value": "", - "title": "Speculative Model", - "description": "The name of the draft model to be used in speculative decoding.", - "required": false, - "type": "text" - }, - "NUM_SPECULATIVE_TOKENS": { - "env_var_name": "NUM_SPECULATIVE_TOKENS", - "value": "", - "title": "Num Speculative Tokens", - "description": "The number of speculative tokens to sample from the draft model.", - "required": false, - "type": "number" - }, - "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": { - "env_var_name": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", - "value": "", - "title": "Speculative Draft Tensor Parallel Size", - "description": "Number of tensor parallel replicas for the draft model.", - "required": false, - "type": "number" - }, - "SPECULATIVE_MAX_MODEL_LEN": { - "env_var_name": "SPECULATIVE_MAX_MODEL_LEN", - "value": "", - "title": "Speculative Max Model Length", - "description": "The maximum sequence length supported by the draft model.", - "required": false, - "type": "number" - }, - "SPECULATIVE_DISABLE_BY_BATCH_SIZE": { - "env_var_name": "SPECULATIVE_DISABLE_BY_BATCH_SIZE", - "value": "", - "title": "Speculative Disable by Batch Size", - "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", - "required": false, - "type": "number" - }, - "NGRAM_PROMPT_LOOKUP_MAX": { - "env_var_name": "NGRAM_PROMPT_LOOKUP_MAX", - "value": "", - "title": "Ngram Prompt Lookup Max", - "description": "Max size of window for ngram prompt lookup in speculative decoding.", - "required": false, - "type": "number" - }, - "NGRAM_PROMPT_LOOKUP_MIN": { - "env_var_name": "NGRAM_PROMPT_LOOKUP_MIN", - "value": "", - "title": "Ngram Prompt Lookup Min", - "description": "Min size of window for ngram prompt lookup in speculative decoding.", - "required": false, - "type": "number" - }, - "SPEC_DECODING_ACCEPTANCE_METHOD": { - "env_var_name": "SPEC_DECODING_ACCEPTANCE_METHOD", - "value": "rejection_sampler", - "title": "Speculative Decoding Acceptance Method", - "description": "Specify the acceptance method for draft token verification in speculative decoding.", - "required": false, - "type": "select", - "options": [ - { "value": "rejection_sampler", "label": "rejection_sampler" }, - { "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" } - ] - }, - "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": { - "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", - "value": "", - "title": "Typical Acceptance Sampler Posterior Threshold", - "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", - "required": false, - "type": "number" - }, - 
"TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": { - "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", - "value": "", - "title": "Typical Acceptance Sampler Posterior Alpha", - "description": "A scaling factor for the entropy-based threshold for token acceptance.", - "required": false, - "type": "number" - }, - "MODEL_LOADER_EXTRA_CONFIG": { - "env_var_name": "MODEL_LOADER_EXTRA_CONFIG", - "value": "", - "title": "Model Loader Extra Config", - "description": "Extra config for model loader.", - "required": false, - "type": "text" - }, - "PREEMPTION_MODE": { - "env_var_name": "PREEMPTION_MODE", - "value": "", - "title": "Preemption Mode", - "description": "If 'recompute', the engine performs preemption-aware recomputation. If 'save', the engine saves activations into the CPU memory as preemption happens.", - "required": false, - "type": "text" - }, - "PREEMPTION_CHECK_PERIOD": { - "env_var_name": "PREEMPTION_CHECK_PERIOD", - "value": 1.0, - "title": "Preemption Check Period", - "description": "How frequently the engine checks if a preemption happens.", - "required": false, - "type": "number" - }, - "PREEMPTION_CPU_CAPACITY": { - "env_var_name": "PREEMPTION_CPU_CAPACITY", - "value": 2, - "title": "Preemption CPU Capacity", - "description": "The percentage of CPU memory used for the saved activations.", - "required": false, - "type": "number" - }, - "MAX_LOG_LEN": { - "env_var_name": "MAX_LOG_LEN", - "value": "", - "title": "Max Log Length", - "description": "Max number of characters or ID numbers being printed in log.", - "required": false, - "type": "number" - }, - "DISABLE_LOGGING_REQUEST": { - "env_var_name": "DISABLE_LOGGING_REQUEST", - "value": false, - "title": "Disable Logging Request", - "description": "Disable logging requests.", - "required": false, - "type": "toggle" - }, - "TOKENIZER_NAME": { - "env_var_name": "TOKENIZER_NAME", - "value": "", - "title": "Tokenizer Name", - "description": "Tokenizer repo to use a different tokenizer than the model's default", - "required": false, - "type": "text" - }, - "TOKENIZER_REVISION": { - "env_var_name": "TOKENIZER_REVISION", - "value": "", - "title": "Tokenizer Revision", - "description": "Tokenizer revision to load", - "required": false, - "type": "text" - }, - "CUSTOM_CHAT_TEMPLATE": { - "env_var_name": "CUSTOM_CHAT_TEMPLATE", - "value": "", - "title": "Custom Chat Template", - "description": "Custom chat jinja template", - "required": false, - "type": "text" - }, - "GPU_MEMORY_UTILIZATION": { - "env_var_name": "GPU_MEMORY_UTILIZATION", - "value": "0.95", - "title": "GPU Memory Utilization", - "description": "Sets GPU VRAM utilization", - "required": false, - "type": "number" - }, - "BLOCK_SIZE": { - "env_var_name": "BLOCK_SIZE", - "value": "16", - "title": "Block Size", - "description": "Token block size for contiguous chunks of tokens", - "required": false, - "type": "number" - }, - "SWAP_SPACE": { - "env_var_name": "SWAP_SPACE", - "value": "4", - "title": "Swap Space", - "description": "CPU swap space size (GiB) per GPU", - "required": false, - "type": "number" - }, - "ENFORCE_EAGER": { - "env_var_name": "ENFORCE_EAGER", - "value": false, - "title": "Enforce Eager", - "description": "Always use eager-mode PyTorch. 
If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", - "required": false, - "type": "toggle" - }, - "MAX_SEQ_LEN_TO_CAPTURE": { - "env_var_name": "MAX_SEQ_LEN_TO_CAPTURE", - "value": "8192", - "title": "CUDA Graph Max Content Length", - "description": "Maximum context length covered by CUDA graphs. If a sequence has context length larger than this, we fall back to eager mode", - "required": false, - "type": "number" - }, - "DISABLE_CUSTOM_ALL_REDUCE": { - "env_var_name": "DISABLE_CUSTOM_ALL_REDUCE", - "value": false, - "title": "Disable Custom All Reduce", - "description": "Enables or disables custom all reduce", - "required": false, - "type": "toggle" - }, - "DEFAULT_BATCH_SIZE": { - "env_var_name": "DEFAULT_BATCH_SIZE", - "value": "50", - "title": "Default Final Batch Size", - "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", - "required": false, - "type": "number" - }, - "DEFAULT_MIN_BATCH_SIZE": { - "env_var_name": "DEFAULT_MIN_BATCH_SIZE", - "value": "1", - "title": "Default Starting Batch Size", - "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", - "required": false, - "type": "number" - }, - "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { - "env_var_name": "DEFAULT_BATCH_SIZE_GROWTH_FACTOR", - "value": "3", - "title": "Default Batch Size Growth Factor", - "description": "Growth factor for dynamic batch size", - "required": false, - "type": "number" - }, - "RAW_OPENAI_OUTPUT": { - "env_var_name": "RAW_OPENAI_OUTPUT", - "value": true, - "title": "Raw OpenAI Output", - "description": "Raw OpenAI output instead of just the text", - "required": false, - "type": "toggle" - }, - "OPENAI_RESPONSE_ROLE": { - "env_var_name": "OPENAI_RESPONSE_ROLE", - "value": "assistant", - "title": "OpenAI Response Role", - "description": "Role of the LLM's Response in OpenAI Chat Completions", - "required": false, - "type": "text" - }, - "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { - "env_var_name": "OPENAI_SERVED_MODEL_NAME_OVERRIDE", - "value": "", - "title": "OpenAI Served Model Name Override", - "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", - "required": false, - "type": "text" - }, - "MAX_CONCURRENCY": { - "env_var_name": "MAX_CONCURRENCY", - "value": "300", - "title": "Max Concurrency", - "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", - "required": false, - "type": "number" - }, - "MODEL_REVISION": { - "env_var_name": "MODEL_REVISION", - "value": "", - "title": "Model Revision", - "description": "Model revision (branch) to load", - "required": false, - "type": "text" - }, - "BASE_PATH": { - "env_var_name": "BASE_PATH", - "value": "/runpod-volume", - "title": "Base Path", - "description": "Storage directory for Huggingface cache and model", - "required": false, - "type": "text" + "env_var_name": "MAX_CPU_LORAS", + "value": "", + "title": "Max CPU LoRAs", + "description": "Maximum number of LoRAs to store in CPU memory.", + "required": false, + "type": "number" + }, + "FULLY_SHARDED_LORAS": { + "env_var_name": "FULLY_SHARDED_LORAS", + "value": false, + "title": "Fully Sharded LoRAs", + "description": "Enable fully sharded LoRA layers.", + "required": false, + "type": "toggle" + }, + "DEVICE": { + "env_var_name": "DEVICE", + "value": "auto", + "title": "Device", + "description": "Device type for vLLM execution.", + "required": false, + "type": "select", + "options": [ + { "value": "auto", "label": "auto" }, + { "value": "cuda", "label": "cuda" }, + { "value": "neuron", "label": "neuron" }, + { "value": "cpu", "label": "cpu" }, + { "value": "openvino", "label": "openvino" }, + { "value": "tpu", "label": "tpu" }, + { "value": "xpu", "label": "xpu" } + ] + }, + "SCHEDULER_DELAY_FACTOR": { + "env_var_name": "SCHEDULER_DELAY_FACTOR", + "value": 0.0, + "title": "Scheduler Delay Factor", + "description": "Apply a delay before scheduling next prompt.", + "required": false, + "type": "number" + }, + "ENABLE_CHUNKED_PREFILL": { + "env_var_name": "ENABLE_CHUNKED_PREFILL", + "value": false, + "title": "Enable Chunked Prefill", + "description": "Enable chunked prefill requests.", + "required": false, + "type": "toggle" + }, + "SPECULATIVE_MODEL": { + "env_var_name": "SPECULATIVE_MODEL", + "value": "", + "title": "Speculative Model", + "description": "The name of the draft model to be used in speculative decoding.", + "required": false, + "type": "text" + }, + "NUM_SPECULATIVE_TOKENS": { + "env_var_name": "NUM_SPECULATIVE_TOKENS", + "value": "", + "title": "Num Speculative Tokens", + "description": "The number of speculative tokens to sample from the draft model.", + "required": false, + "type": "number" + }, + "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE": { + "env_var_name": "SPECULATIVE_DRAFT_TENSOR_PARALLEL_SIZE", + "value": "", + "title": "Speculative Draft Tensor Parallel Size", + "description": "Number of tensor parallel replicas for the draft model.", + "required": false, + "type": "number" + }, + "SPECULATIVE_MAX_MODEL_LEN": { + "env_var_name": "SPECULATIVE_MAX_MODEL_LEN", + "value": "", + "title": "Speculative Max Model Length", + "description": "The maximum sequence length supported by the draft model.", + "required": false, + "type": "number" + }, + "SPECULATIVE_DISABLE_BY_BATCH_SIZE": { + "env_var_name": "SPECULATIVE_DISABLE_BY_BATCH_SIZE", + "value": "", + "title": "Speculative Disable by Batch Size", + "description": "Disable speculative decoding if the number of enqueue requests is larger than this value.", + "required": false, + "type": "number" + }, + "NGRAM_PROMPT_LOOKUP_MAX": { + "env_var_name": "NGRAM_PROMPT_LOOKUP_MAX", + "value": "", + "title": "Ngram Prompt Lookup Max", + "description": "Max size of window for ngram prompt lookup in speculative decoding.", + "required": false, + 
"type": "number" + }, + "NGRAM_PROMPT_LOOKUP_MIN": { + "env_var_name": "NGRAM_PROMPT_LOOKUP_MIN", + "value": "", + "title": "Ngram Prompt Lookup Min", + "description": "Min size of window for ngram prompt lookup in speculative decoding.", + "required": false, + "type": "number" + }, + "SPEC_DECODING_ACCEPTANCE_METHOD": { + "env_var_name": "SPEC_DECODING_ACCEPTANCE_METHOD", + "value": "rejection_sampler", + "title": "Speculative Decoding Acceptance Method", + "description": "Specify the acceptance method for draft token verification in speculative decoding.", + "required": false, + "type": "select", + "options": [ + { "value": "rejection_sampler", "label": "rejection_sampler" }, + { "value": "typical_acceptance_sampler", "label": "typical_acceptance_sampler" } + ] + }, + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD": { + "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_THRESHOLD", + "value": "", + "title": "Typical Acceptance Sampler Posterior Threshold", + "description": "Set the lower bound threshold for the posterior probability of a token to be accepted.", + "required": false, + "type": "number" + }, + "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA": { + "env_var_name": "TYPICAL_ACCEPTANCE_SAMPLER_POSTERIOR_ALPHA", + "value": "", + "title": "Typical Acceptance Sampler Posterior Alpha", + "description": "A scaling factor for the entropy-based threshold for token acceptance.", + "required": false, + "type": "number" + }, + "MODEL_LOADER_EXTRA_CONFIG": { + "env_var_name": "MODEL_LOADER_EXTRA_CONFIG", + "value": "", + "title": "Model Loader Extra Config", + "description": "Extra config for model loader.", + "required": false, + "type": "text" + }, + "PREEMPTION_MODE": { + "env_var_name": "PREEMPTION_MODE", + "value": "", + "title": "Preemption Mode", + "description": "If 'recompute', the engine performs preemption-aware recomputation. 
If 'save', the engine saves activations into the CPU memory as preemption happens.", + "required": false, + "type": "text" + }, + "PREEMPTION_CHECK_PERIOD": { + "env_var_name": "PREEMPTION_CHECK_PERIOD", + "value": 1.0, + "title": "Preemption Check Period", + "description": "How frequently the engine checks if a preemption happens.", + "required": false, + "type": "number" + }, + "PREEMPTION_CPU_CAPACITY": { + "env_var_name": "PREEMPTION_CPU_CAPACITY", + "value": 2, + "title": "Preemption CPU Capacity", + "description": "The percentage of CPU memory used for the saved activations.", + "required": false, + "type": "number" + }, + "MAX_LOG_LEN": { + "env_var_name": "MAX_LOG_LEN", + "value": "", + "title": "Max Log Length", + "description": "Max number of characters or ID numbers being printed in log.", + "required": false, + "type": "number" + }, + "DISABLE_LOGGING_REQUEST": { + "env_var_name": "DISABLE_LOGGING_REQUEST", + "value": false, + "title": "Disable Logging Request", + "description": "Disable logging requests.", + "required": false, + "type": "toggle" + }, + "TOKENIZER_NAME": { + "env_var_name": "TOKENIZER_NAME", + "value": "", + "title": "Tokenizer Name", + "description": "Tokenizer repo to use a different tokenizer than the model's default", + "required": false, + "type": "text" + }, + "TOKENIZER_REVISION": { + "env_var_name": "TOKENIZER_REVISION", + "value": "", + "title": "Tokenizer Revision", + "description": "Tokenizer revision to load", + "required": false, + "type": "text" + }, + "CUSTOM_CHAT_TEMPLATE": { + "env_var_name": "CUSTOM_CHAT_TEMPLATE", + "value": "", + "title": "Custom Chat Template", + "description": "Custom chat jinja template", + "required": false, + "type": "text" + }, + "GPU_MEMORY_UTILIZATION": { + "env_var_name": "GPU_MEMORY_UTILIZATION", + "value": "0.95", + "title": "GPU Memory Utilization", + "description": "Sets GPU VRAM utilization", + "required": false, + "type": "number" + }, + "BLOCK_SIZE": { + "env_var_name": "BLOCK_SIZE", + "value": "16", + "title": "Block Size", + "description": "Token block size for contiguous chunks of tokens", + "required": false, + "type": "number" + }, + "SWAP_SPACE": { + "env_var_name": "SWAP_SPACE", + "value": "4", + "title": "Swap Space", + "description": "CPU swap space size (GiB) per GPU", + "required": false, + "type": "number" + }, + "ENFORCE_EAGER": { + "env_var_name": "ENFORCE_EAGER", + "value": false, + "title": "Enforce Eager", + "description": "Always use eager-mode PyTorch. If False (0), will use eager mode and CUDA graph in hybrid for maximal performance and flexibility", + "required": false, + "type": "toggle" + }, + "MAX_SEQ_LEN_TO_CAPTURE": { + "env_var_name": "MAX_SEQ_LEN_TO_CAPTURE", + "value": "8192", + "title": "CUDA Graph Max Content Length", + "description": "Maximum context length covered by CUDA graphs. 
If a sequence has context length larger than this, we fall back to eager mode", + "required": false, + "type": "number" + }, + "DISABLE_CUSTOM_ALL_REDUCE": { + "env_var_name": "DISABLE_CUSTOM_ALL_REDUCE", + "value": false, + "title": "Disable Custom All Reduce", + "description": "Enables or disables custom all reduce", + "required": false, + "type": "toggle" + }, + "DEFAULT_BATCH_SIZE": { + "env_var_name": "DEFAULT_BATCH_SIZE", + "value": "50", + "title": "Default Final Batch Size", + "description": "Default and Maximum batch size for token streaming to reduce HTTP calls", + "required": false, + "type": "number" + }, + "DEFAULT_MIN_BATCH_SIZE": { + "env_var_name": "DEFAULT_MIN_BATCH_SIZE", + "value": "1", + "title": "Default Starting Batch Size", + "description": "Batch size for the first request, which will be multiplied by the growth factor every subsequent request", + "required": false, + "type": "number" + }, + "DEFAULT_BATCH_SIZE_GROWTH_FACTOR": { + "env_var_name": "DEFAULT_BATCH_SIZE_GROWTH_FACTOR", + "value": "3", + "title": "Default Batch Size Growth Factor", + "description": "Growth factor for dynamic batch size", + "required": false, + "type": "number" + }, + "RAW_OPENAI_OUTPUT": { + "env_var_name": "RAW_OPENAI_OUTPUT", + "value": true, + "title": "Raw OpenAI Output", + "description": "Raw OpenAI output instead of just the text", + "required": false, + "type": "toggle" + }, + "OPENAI_RESPONSE_ROLE": { + "env_var_name": "OPENAI_RESPONSE_ROLE", + "value": "assistant", + "title": "OpenAI Response Role", + "description": "Role of the LLM's Response in OpenAI Chat Completions", + "required": false, + "type": "text" + }, + "OPENAI_SERVED_MODEL_NAME_OVERRIDE": { + "env_var_name": "OPENAI_SERVED_MODEL_NAME_OVERRIDE", + "value": "", + "title": "OpenAI Served Model Name Override", + "description": "Overrides the name of the served model from model repo/path to specified name, which you will then be able to use the value for the `model` parameter when making OpenAI requests", + "required": false, + "type": "text" + }, + "MAX_CONCURRENCY": { + "env_var_name": "MAX_CONCURRENCY", + "value": "300", + "title": "Max Concurrency", + "description": "Max concurrent requests per worker. 
vLLM has an internal queue, so you don't have to worry about limiting by VRAM, this is for improving scaling/load balancing efficiency", + "required": false, + "type": "number" + }, + "MODEL_REVISION": { + "env_var_name": "MODEL_REVISION", + "value": "", + "title": "Model Revision", + "description": "Model revision (branch) to load", + "required": false, + "type": "text" + }, + "BASE_PATH": { + "env_var_name": "BASE_PATH", + "value": "/runpod-volume", + "title": "Base Path", + "description": "Storage directory for Huggingface cache and model", + "required": false, + "type": "text" + }, + "DISABLE_LOG_REQUESTS": { + "env_var_name": "DISABLE_LOG_REQUESTS", + "value": true, + "title": "Disable Log Requests", + "description": "Enables or disables vLLM request logging", + "required": false, + "type": "toggle" } } } \ No newline at end of file From e6172dddd4d429d0beba78f69e5979bb5de04054 Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Mon, 19 Aug 2024 15:17:20 -0700 Subject: [PATCH 4/8] took out space in imageName from version 0.5.4 --- worker-config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/worker-config.json b/worker-config.json index f15ca2d..12e2697 100644 --- a/worker-config.json +++ b/worker-config.json @@ -1,7 +1,7 @@ { "versions": { "0.5.4": { - "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0 ", + "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0", "categories": [ { "title": "LLM Settings", From 1e9aeb6e8f3a3c77a67f59749b99f1e34cdfce7f Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Wed, 21 Aug 2024 12:44:45 -0700 Subject: [PATCH 5/8] adding "minimum_cuda_version" to each version --- worker-config.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/worker-config.json b/worker-config.json index 12e2697..2cb1743 100644 --- a/worker-config.json +++ b/worker-config.json @@ -2,6 +2,7 @@ "versions": { "0.5.4": { "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0", + "minimum_cuda_version": "12.1", "categories": [ { "title": "LLM Settings", @@ -61,6 +62,7 @@ }, "0.5.3": { "imageName": "runpod/worker-v1-vllm:stable-cuda12.1.0", + "minimum_cuda_version": "12.1", "categories": [ { "title": "LLM Settings", @@ -120,6 +122,7 @@ }, "0.4.2": { "imageName": "runpod/worker-vllm:stable-cuda12.1.0", + "minimum_cuda_version": "12.1", "categories": [ { "title": "LLM Settings", From 825ef25b6034cbebe537e5c372ce6a9192a60256 Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Wed, 21 Aug 2024 12:47:48 -0700 Subject: [PATCH 6/8] changed to minimumCudaVersion to camel case --- worker-config.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/worker-config.json b/worker-config.json index 2cb1743..bb25d01 100644 --- a/worker-config.json +++ b/worker-config.json @@ -2,7 +2,7 @@ "versions": { "0.5.4": { "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0", - "minimum_cuda_version": "12.1", + "minimumCudaVersion": "12.1", "categories": [ { "title": "LLM Settings", @@ -62,7 +62,7 @@ }, "0.5.3": { "imageName": "runpod/worker-v1-vllm:stable-cuda12.1.0", - "minimum_cuda_version": "12.1", + "minimumCudaVersion": "12.1", "categories": [ { "title": "LLM Settings", @@ -122,7 +122,7 @@ }, "0.4.2": { "imageName": "runpod/worker-vllm:stable-cuda12.1.0", - "minimum_cuda_version": "12.1", + "minimumCudaVersion": "12.1", "categories": [ { "title": "LLM Settings", From 39ce8a64c057d10f2ddd01c508e5b45f97a49d2a Mon Sep 17 00:00:00 2001 From: carlson-svg Date: Thu, 22 Aug 2024 15:40:54 -0700 Subject: [PATCH 7/8] initial documentation for 
 worker-config.json

---
 README.md | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/README.md b/README.md
index 1633a74..fba828b 100644
--- a/README.md
+++ b/README.md
@@ -512,4 +512,85 @@ Your list can contain any number of messages, and each message usually can have
 }
 ]
 ```
 
+
+# Worker Config
+## Description
+The worker config is a JSON file used to build the form that helps users configure their serverless endpoint on the RunPod Web Interface.
+
+## Writing your worker-config.json
+The JSON consists of two main parts: `schema` and `versions`.
+- `schema`: Here you specify the form fields that will be displayed to the user.
+  - `env_var_name`: The name of the environment variable that the form field sets.
+  - `value`: The default value of the form field. It is shown in the UI unless the user changes it.
+  - `title`: The title of the form field in the UI.
+  - `description`: The description of the form field in the UI.
+  - `required`: A boolean that specifies whether the form field is required.
+  - `type`: The type of the form field. Options are:
+    - `text`: The environment variable is a string, so the user enters text in the form field.
+    - `select`: The user selects one option from a dropdown. You must provide the `options` key-value pair alongside `type` when using this.
+    - `toggle`: The user toggles between true and false.
+    - `number`: The user enters a number in the form field.
+  - `options`: The options the user can select from when the type is `select`. Do not include this unless the `type` is `select`.
+- `versions`: Here you reference the form fields specified in `schema` and organize them into categories.
+  - `imageName`: The name of the Docker image used to run the serverless endpoint.
+  - `minimumCudaVersion`: The minimum CUDA version required to run the serverless endpoint.
+  - `categories`: Here you reference the keys of the form fields specified in `schema` and organize them into categories. Each category is a collapsible list of forms in the Web UI.
+    - `title`: The title of the category in the UI.
+    - `settings`: The array of setting keys from `schema` associated with the category.
+
+## Example of schema
+```json
+{
+  "schema": {
+    "TOKENIZER": {
+      "env_var_name": "TOKENIZER",
+      "value": "",
+      "title": "Tokenizer",
+      "description": "Name or path of the Hugging Face tokenizer to use.",
+      "required": false,
+      "type": "text"
+    },
+    "TOKENIZER_MODE": {
+      "env_var_name": "TOKENIZER_MODE",
+      "value": "auto",
+      "title": "Tokenizer Mode",
+      "description": "The tokenizer mode.",
+      "required": false,
+      "type": "select",
+      "options": [
+        { "value": "auto", "label": "auto" },
+        { "value": "slow", "label": "slow" }
+      ]
+    },
+    ...
+  }
+}
+```
+
+## Example of versions
+```json
+{
+  "versions": {
+    "0.5.4": {
+      "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0",
+      "minimumCudaVersion": "12.1",
+      "categories": [
+        {
+          "title": "LLM Settings",
+          "settings": [
+            "TOKENIZER", "TOKENIZER_MODE", "OTHER_SETTINGS_SCHEMA_KEYS_YOU_HAVE_SPECIFIED_0", ...
+          ]
+        },
+        {
+          "title": "Tokenizer Settings",
+          "settings": [
+            "OTHER_SETTINGS_SCHEMA_KEYS_0", "OTHER_SETTINGS_SCHEMA_KEYS_1", ...
+          ]
+        },
+        ...
+      ]
+    }
+  }
+}
+```
\ No newline at end of file

From 4fa4a8e0e6c51b0915df207ea92c4fa42708a34a Mon Sep 17 00:00:00 2001
From: carlson-svg
Date: Mon, 26 Aug 2024 01:20:18 -0700
Subject: [PATCH 8/8] added worker config docs to table of contents + added
 side note

---
 README.md | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/README.md b/README.md
index 1633a74..c9c2c17 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,10 @@ Worker vLLM is now cached on all RunPod machines, resulting in near-instant depl
 - [Input Request Parameters](#input-request-parameters)
   - [Text Input Formats](#text-input-formats)
   - [Sampling Parameters](#sampling-parameters)
+- [Worker Config](#worker-config)
+  - [Writing your worker-config.json](#writing-your-worker-configjson)
+  - [Example of schema](#example-of-schema)
+  - [Example of versions](#example-of-versions)
 
 # Setting up the Serverless Worker
 
@@ -513,3 +517,86 @@ Your list can contain any number of messages, and each message usually can have
 ]
 ```
 
+
+
+# Worker Config
+The worker config is a JSON file used to build the form that helps users configure their serverless endpoint on the RunPod Web Interface.
+
+Note: This is a new feature and currently only works for workers that use a single model.
+
+## Writing your worker-config.json
+The JSON consists of two main parts: `schema` and `versions`.
+- `schema`: Here you specify the form fields that will be displayed to the user.
+  - `env_var_name`: The name of the environment variable that the form field sets.
+  - `value`: The default value of the form field. It is shown in the UI unless the user changes it.
+  - `title`: The title of the form field in the UI.
+  - `description`: The description of the form field in the UI.
+  - `required`: A boolean that specifies whether the form field is required.
+  - `type`: The type of the form field (see the sketch after this list). Options are:
+    - `text`: The environment variable is a string, so the user enters text in the form field.
+    - `select`: The user selects one option from a dropdown. You must provide the `options` key-value pair alongside `type` when using this.
+    - `toggle`: The user toggles between true and false.
+    - `number`: The user enters a number in the form field.
+  - `options`: The options the user can select from when the type is `select`. Do not include this unless the `type` is `select`.
+- `versions`: Here you reference the form fields specified in `schema` and organize them into categories.
+  - `imageName`: The name of the Docker image used to run the serverless endpoint.
+  - `minimumCudaVersion`: The minimum CUDA version required to run the serverless endpoint.
+  - `categories`: Here you reference the keys of the form fields specified in `schema` and organize them into categories. Each category is a collapsible list of forms in the Web UI.
+    - `title`: The title of the category in the UI.
+    - `settings`: The array of setting keys from `schema` associated with the category.
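+
+In addition to the `text` and `select` fields shown in the examples below, `toggle` and `number` entries follow the same pattern. A minimal sketch using two fields from this worker's own config (descriptions abbreviated):
+```json
+{
+  "schema": {
+    "ENFORCE_EAGER": {
+      "env_var_name": "ENFORCE_EAGER",
+      "value": false,
+      "title": "Enforce Eager",
+      "description": "Always use eager-mode PyTorch.",
+      "required": false,
+      "type": "toggle"
+    },
+    "BLOCK_SIZE": {
+      "env_var_name": "BLOCK_SIZE",
+      "value": "16",
+      "title": "Block Size",
+      "description": "Token block size for contiguous chunks of tokens.",
+      "required": false,
+      "type": "number"
+    }
+  }
+}
+```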
+
+## Example of schema
+```json
+{
+  "schema": {
+    "TOKENIZER": {
+      "env_var_name": "TOKENIZER",
+      "value": "",
+      "title": "Tokenizer",
+      "description": "Name or path of the Hugging Face tokenizer to use.",
+      "required": false,
+      "type": "text"
+    },
+    "TOKENIZER_MODE": {
+      "env_var_name": "TOKENIZER_MODE",
+      "value": "auto",
+      "title": "Tokenizer Mode",
+      "description": "The tokenizer mode.",
+      "required": false,
+      "type": "select",
+      "options": [
+        { "value": "auto", "label": "auto" },
+        { "value": "slow", "label": "slow" }
+      ]
+    },
+    ...
+  }
+}
+```
+
+## Example of versions
+```json
+{
+  "versions": {
+    "0.5.4": {
+      "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0",
+      "minimumCudaVersion": "12.1",
+      "categories": [
+        {
+          "title": "LLM Settings",
+          "settings": [
+            "TOKENIZER", "TOKENIZER_MODE", "OTHER_SETTINGS_SCHEMA_KEYS_YOU_HAVE_SPECIFIED_0", ...
+          ]
+        },
+        {
+          "title": "Tokenizer Settings",
+          "settings": [
+            "OTHER_SETTINGS_SCHEMA_KEYS_0", "OTHER_SETTINGS_SCHEMA_KEYS_1", ...
+          ]
+        },
+        ...
+      ]
+    }
+  }
+}
+```
\ No newline at end of file
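+
+Both parts sit side by side at the top level of the same worker-config.json file. An abbreviated sketch of the full file, with `...` standing in for the remaining settings (a real file lists every setting key):
+```json
+{
+  "schema": {
+    "TOKENIZER": {
+      "env_var_name": "TOKENIZER",
+      "value": "",
+      "title": "Tokenizer",
+      "description": "Name or path of the Hugging Face tokenizer to use.",
+      "required": false,
+      "type": "text"
+    },
+    ...
+  },
+  "versions": {
+    "0.5.4": {
+      "imageName": "runpod/worker-v1-vllm:v1.2.0stable-cuda12.1.0",
+      "minimumCudaVersion": "12.1",
+      "categories": [
+        {
+          "title": "LLM Settings",
+          "settings": [ "TOKENIZER", ... ]
+        },
+        ...
+      ]
+    }
+  }
+}
+```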