
Fix deprecated max_context_len_to_capture engine argument
alpayariyak committed Jun 13, 2024
1 parent c8458fe commit 0e1e383
Showing 2 changed files with 7 additions and 2 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -115,7 +115,7 @@ Below is a summary of the available RunPod Worker images, categorized by image s
| `BLOCK_SIZE` | `16` | `8`, `16`, `32` | Token block size for contiguous chunks of tokens. |
| `SWAP_SPACE` | `4` | `int` | CPU swap space size (GiB) per GPU. |
| `ENFORCE_EAGER` | `0` | boolean as `int` | Always use eager-mode PyTorch. If `0` (False), eager mode and CUDA graphs are used in hybrid for maximal performance and flexibility. |
- | `MAX_CONTEXT_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
+ | `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
| `DISABLE_CUSTOM_ALL_REDUCE` | `0` | `int` | When `1`, disables the custom all-reduce kernel. |
**Streaming Batch Size Settings**:
| `DEFAULT_BATCH_SIZE` | `50` | `int` | Default and maximum batch size for token streaming, used to reduce the number of HTTP calls. |
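The boolean-as-`int` settings above (`ENFORCE_EAGER`, `DISABLE_CUSTOM_ALL_REDUCE`) are read through a `get_int_bool_env` helper, which is called in the src/config.py hunk below but defined outside this diff. A minimal sketch of its presumed behavior:

```python
import os

def get_int_bool_env(name: str, default: bool) -> bool:
    """Presumed behavior of the helper used in config.py: read an
    environment variable that encodes a boolean as an int ("0"/"1"),
    falling back to `default` when the variable is unset."""
    value = os.getenv(name)
    if value is None:
        return default
    return bool(int(value))

# Example: with ENFORCE_EAGER=1 set on the endpoint,
# get_int_bool_env("ENFORCE_EAGER", False) returns True.
```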
src/config.py (7 changes: 6 additions & 1 deletion)
@@ -47,11 +47,16 @@ def _initialize_config(self):
"kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),
"block_size": int(os.getenv("BLOCK_SIZE")) if os.getenv("BLOCK_SIZE") else None,
"swap_space": int(os.getenv("SWAP_SPACE")) if os.getenv("SWAP_SPACE") else None,
"max_context_len_to_capture": int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE") else None,
"max_seq_len_to_capture": int(os.getenv("MAX_SEQ_LEN_TO_CAPTURE")) if os.getenv("MAX_SEQ_LEN_TO_CAPTURE") else None,
"disable_custom_all_reduce": get_int_bool_env("DISABLE_CUSTOM_ALL_REDUCE", False),
"enforce_eager": get_int_bool_env("ENFORCE_EAGER", False)
}
if args["kv_cache_dtype"] == "fp8_e5m2":
args["kv_cache_dtype"] = "fp8"
logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.")
if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"):
args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")


return {k: v for k, v in args.items() if v not in [None, ""]}
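Run in isolation, the new fallback behaves as sketched below (values are illustrative). Note that because the deprecated check runs after the `args` dict is built, `MAX_CONTEXT_LEN_TO_CAPTURE` overwrites `MAX_SEQ_LEN_TO_CAPTURE` when both are set:

```python
import logging
import os

# Standalone sketch of the fallback added in this commit.
os.environ["MAX_SEQ_LEN_TO_CAPTURE"] = "8192"      # preferred name
os.environ["MAX_CONTEXT_LEN_TO_CAPTURE"] = "4096"  # deprecated name

args = {
    "max_seq_len_to_capture": int(os.getenv("MAX_SEQ_LEN_TO_CAPTURE"))
    if os.getenv("MAX_SEQ_LEN_TO_CAPTURE")
    else None,
}
if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"):
    args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
    logging.warning(
        "Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. "
        "Please use MAX_SEQ_LEN_TO_CAPTURE instead."
    )

print(args)  # {'max_seq_len_to_capture': 4096} -- the deprecated value wins
```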
