
Fix deprecated max_context_len_to_capture engine argument
alpayariyak committed Jun 13, 2024
1 parent c8458fe commit 0e1e383
Showing 2 changed files with 7 additions and 2 deletions.
README.md (2 changes: 1 addition & 1 deletion)
@@ -115,7 +115,7 @@ Below is a summary of the available RunPod Worker images, categorized by image s
| `BLOCK_SIZE` | `16` | `8`, `16`, `32` | Token block size for contiguous chunks of tokens. |
| `SWAP_SPACE` | `4` | `int` | CPU swap space size (GiB) per GPU. |
| `ENFORCE_EAGER` | `0` | boolean as `int` | Always use eager-mode PyTorch. If `0` (False), eager mode and CUDA graphs are used in hybrid for maximal performance and flexibility. |
- | `MAX_CONTEXT_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
+ | `MAX_SEQ_LEN_TO_CAPTURE` | `8192` | `int` | Maximum context length covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. |
| `DISABLE_CUSTOM_ALL_REDUCE` | `0` | `int` | When `1`, disables the custom all-reduce kernel. |
**Streaming Batch Size Settings**:
| `DEFAULT_BATCH_SIZE` | `50` | `int` | Default and maximum batch size for token streaming, used to reduce the number of HTTP calls. |
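The boolean-as-`int` settings above (`ENFORCE_EAGER`, `DISABLE_CUSTOM_ALL_REDUCE`) are read through a `get_int_bool_env` helper, which is called in the src/config.py hunk below but defined outside this diff. A minimal sketch of its presumed behavior:

```python
import os

def get_int_bool_env(name: str, default: bool) -> bool:
    """Presumed behavior of the helper used in config.py: read an
    environment variable that encodes a boolean as an int ("0"/"1"),
    falling back to `default` when the variable is unset."""
    value = os.getenv(name)
    if value is None:
        return default
    return bool(int(value))

# Example: with ENFORCE_EAGER=1 set on the endpoint,
# get_int_bool_env("ENFORCE_EAGER", False) returns True.
```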
src/config.py (7 changes: 6 additions & 1 deletion)
@@ -47,11 +47,16 @@ def _initialize_config(self):
"kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),
"block_size": int(os.getenv("BLOCK_SIZE")) if os.getenv("BLOCK_SIZE") else None,
"swap_space": int(os.getenv("SWAP_SPACE")) if os.getenv("SWAP_SPACE") else None,
"max_context_len_to_capture": int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE")) if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE") else None,
"max_seq_len_to_capture": int(os.getenv("MAX_SEQ_LEN_TO_CAPTURE")) if os.getenv("MAX_SEQ_LEN_TO_CAPTURE") else None,
"disable_custom_all_reduce": get_int_bool_env("DISABLE_CUSTOM_ALL_REDUCE", False),
"enforce_eager": get_int_bool_env("ENFORCE_EAGER", False)
}
if args["kv_cache_dtype"] == "fp8_e5m2":
args["kv_cache_dtype"] = "fp8"
logging.warning("Using fp8_e5m2 is deprecated. Please use fp8 instead.")
if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"):
args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
logging.warning("Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. Please use MAX_SEQ_LEN_TO_CAPTURE instead.")


return {k: v for k, v in args.items() if v not in [None, ""]}
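Run in isolation, the new fallback behaves as sketched below (values are illustrative). Note that because the deprecated check runs after the `args` dict is built, `MAX_CONTEXT_LEN_TO_CAPTURE` overwrites `MAX_SEQ_LEN_TO_CAPTURE` when both are set:

```python
import logging
import os

# Standalone sketch of the fallback added in this commit.
os.environ["MAX_SEQ_LEN_TO_CAPTURE"] = "8192"      # preferred name
os.environ["MAX_CONTEXT_LEN_TO_CAPTURE"] = "4096"  # deprecated name

args = {
    "max_seq_len_to_capture": int(os.getenv("MAX_SEQ_LEN_TO_CAPTURE"))
    if os.getenv("MAX_SEQ_LEN_TO_CAPTURE")
    else None,
}
if os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"):
    args["max_seq_len_to_capture"] = int(os.getenv("MAX_CONTEXT_LEN_TO_CAPTURE"))
    logging.warning(
        "Using MAX_CONTEXT_LEN_TO_CAPTURE is deprecated. "
        "Please use MAX_SEQ_LEN_TO_CAPTURE instead."
    )

print(args)  # {'max_seq_len_to_capture': 4096} -- the deprecated value wins
```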
