diff --git a/CHANGELOG.md b/CHANGELOG.md index 98b0e0d77..2a62ded14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,10 @@ # Change Log for SD.Next -## Update for 2024-10-17 +## Update for 2024-10-18 -### Highlights for 2024-10-17 +### Highlights for 2024-10-18 + +Workflow highlights: - **Reprocess**: New workflow options that allow you to generate at lower quality and then reprocess at higher quality for select images only, or generate without hires/refine and then reprocess with hires/refine @@ -10,6 +12,9 @@ - **Detailer**: Fully built-in detailer workflow with support for all standard models - Built-in **model analyzer**: See all details of your currently loaded model, including components, parameter count, layer count, etc. + +Newly supported: + - New fine-tuned [CLIP-ViT-L](https://huggingface.co/zer0int/CLIP-GmP-ViT-L-14) 1st stage **text-encoders** used by SD15, SDXL, Flux.1, etc. bring additional details to your images - New models: - [CogView 3 Plus](https://huggingface.co/THUDM/CogView3-Plus-3B) @@ -18,10 +23,15 @@ [Ctrl+X](https://github.com/genforce/ctrl-x) which allows for control of **structure and appearance** without the need for extra models, [APG: Adaptive Projected Guidance](https://arxiv.org/pdf/2410.02416) for optimal **guidance** control, [LinFusion](https://github.com/Huage001/LinFusion) for on-the-fly distillation of any sd15/sdxl model + +Otherwise notable: + - Several [Flux.1](https://huggingface.co/black-forest-labs/FLUX.1-dev) optimizations and new quantization types - Auto-detection of best available **device/dtype** settings for your platform and GPU reduces the need for manual configuration - Full rewrite of **sampler options**, now far more streamlined with tons of new options to tweak scheduler behavior - Improved **LoRA** detection and handling for all supported models +- Tons of work on dynamic quantization that can be applied on-the-fly during model load to any model type + Supported quantization engines include TorchAO, Optimum.quanto, NNCF compression, and more...
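To make the quantization highlight concrete: the new `torchao_quantization()` helper added to `modules/sd_models_compile.py` later in this diff boils down to calling `torchao.quantization.quantize_` on the selected pipeline components right after the model is loaded. Below is a minimal standalone sketch of that idea rather than the exact SD.Next code path; the checkpoint path, target device, and the `int8_weight_only` choice are illustrative assumptions.

```python
# Minimal sketch of on-the-fly TorchAO weight-only quantization of a loaded pipeline,
# mirroring what torchao_quantization() in modules/sd_models_compile.py does.
# The checkpoint path and device below are illustrative assumptions.
import torch
import diffusers
from torchao import quantization as q

pipe = diffusers.StableDiffusionXLPipeline.from_single_file(
    '/mnt/models/stable-diffusion/sdxl/model.safetensors',  # hypothetical checkpoint
    torch_dtype=torch.bfloat16,
)

# quantize selected components in-place; the "int8" setting maps to int8_weight_only
q.quantize_(pipe.unet, q.int8_weight_only(), device='cuda')
q.quantize_(pipe.text_encoder, q.int8_weight_only(), device='cuda')
q.quantize_(pipe.text_encoder_2, q.int8_weight_only(), device='cuda')

pipe = pipe.to('cuda')
image = pipe('a photo of a mountain lake at sunrise').images[0]
```

In SD.Next itself this is driven by the new `torchao_quantization` and `torchao_quantization_type` options (settings -> compute settings -> quantization) rather than called directly.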
Oh, and we've compiled a full table with a list of popular text-to-image generative models, their respective parameters, and an architecture overview: @@ -30,7 +40,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition [README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867) -### Details for 2024-10-17 +### Details for 2024-10-18 - **reprocess** - new top-level button: reprocess latent from your history of generated image(s) @@ -211,6 +221,11 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - setting `lora_load_gpu` to load LoRA directly to GPU *default*: true unless lowvram +- **torchao** + - reimplement torchao quantization + - configure in settings -> compute settings -> quantization + - can be applied to any model on-the-fly during load + - **huggingface**: - force logout/login on token change - unified handling of cache folder: set via `HF_HUB` or `HF_HUB_CACHE` or via settings -> system paths @@ -219,6 +234,9 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - add support for *image2video* (in addition to previous *text2video* and *video2video*) - *note*: *image2video* requires separate 5b model variant +- **torch** + - due to numerous issues with torch 2.5.0, which was just released as stable, we are sticking with 2.4.1 for now + - **backend=original** is now marked as in maintenance-only mode - **python 3.12** improved compatibility, automatically handle `setuptools` - **control** @@ -233,10 +251,12 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - fix update infotext on image select - fix imageviewer exif parser - selectable info view in image viewer, thanks @ZeldaMaster501 + - setting to enable browser autolaunch, thanks @brknsoul - **free-u** check if device/dtype are fft compatible and cast as necessary - **rocm** - additional gpu detection and auto-config code, thanks @lshqqytiger - experimental triton backend for flash attention, thanks @lshqqytiger + - update to rocm 6.2, thanks @Disty0 - **directml** - update `torch` to 2.4.1, thanks @lshqqytiger - **extensions** diff --git a/cli/load_unet.py b/cli/load_unet.py new file mode 100644 index 000000000..2398cdb64 --- /dev/null +++ b/cli/load_unet.py @@ -0,0 +1,89 @@ +import torch +import diffusers + + +class StateDictStats(): + cls: str = None + device: torch.device = None + params: int = 0 + weights: dict = {} + dtypes: dict = {} + config: dict = None + + def __repr__(self): + return f'cls={self.cls} params={self.params} weights={self.weights} device={self.device} dtypes={self.dtypes} config={self.config is not None}' + + +def set_module_tensor( + module: torch.nn.Module, + name: str, + value: torch.Tensor, + stats: StateDictStats, + device: torch.device = None, + dtype: torch.dtype = None, +): + if "."
in name: + splits = name.split(".") + for split in splits[:-1]: + module = getattr(module, split) + name = splits[-1] + old_value = getattr(module, name) + with torch.no_grad(): + if value.dtype not in stats.dtypes: + stats.dtypes[value.dtype] = 0 + stats.dtypes[value.dtype] += 1 + if name in module._buffers: # pylint: disable=protected-access + module._buffers[name] = value.to(device=device, dtype=dtype, non_blocking=True) # pylint: disable=protected-access + if 'buffers' not in stats.weights: + stats.weights['buffers'] = 0 + stats.weights['buffers'] += 1 + elif value is not None: + param_cls = type(module._parameters[name]) # pylint: disable=protected-access + module._parameters[name] = param_cls(value, requires_grad=old_value.requires_grad).to(device, dtype=dtype, non_blocking=True) # pylint: disable=protected-access + if 'parameters' not in stats.weights: + stats.weights['parameters'] = 0 + stats.weights['parameters'] += 1 + + +def load_unet(config_file: str, state_dict: dict, device: torch.device = None, dtype: torch.dtype = None): + # same can be done for other modules or even for entire model by loading model config and then walking through its modules + from accelerate import init_empty_weights + with init_empty_weights(): + stats = StateDictStats() + stats.device = device + stats.config = diffusers.UNet2DConditionModel.load_config(config_file) + unet = diffusers.UNet2DConditionModel.from_config(stats.config) + stats.cls = unet.__class__.__name__ + expected_state_dict_keys = list(unet.state_dict().keys()) + stats.weights['expected'] = len(expected_state_dict_keys) + for param_name, param in state_dict.items(): + if param_name not in expected_state_dict_keys: + if 'unknown' not in stats.weights: + stats.weights['unknown'] = 0 + stats.weights['unknown'] += 1 + continue + set_module_tensor(unet, name=param_name, value=param, device=device, dtype=dtype, stats=stats) + state_dict[param_name] = None # unload as we initialize the model so we dont consume double the memory + stats.params = sum(p.numel() for p in unet.parameters(recurse=True)) + return unet, stats + + +def load_safetensors(fn: str): + import safetensors.torch + state_dict = safetensors.torch.load_file(fn, device='cpu') # state dict should always be loaded to cpu + return state_dict + + +if __name__ == "__main__": + # need pipe already present to load unet state_dict into or we could load unet first and then manually create pipe with params + pipe = diffusers.StableDiffusionXLPipeline.from_single_file('/mnt/models/stable-diffusion/sdxl/TempestV0.1-Artistic.safetensors', cache_dir='/mnt/models/huggingface') + # this could be kept in memory so we dont have to reload it + dct = load_safetensors('/mnt/models/UNET/dpo-sdxl-text2image.safetensors') + pipe.unet, s = load_unet( + config_file = 'configs/sdxl/unet/config.json', # can also point to online hf model with subfolder + state_dict = dct, + device = torch.device('cpu'), # can leave out to use default device + dtype = torch.bfloat16, # can leave out to use default dtype, especially for mixed precision modules + ) + from rich import print as rprint + rprint(f'Stats: {s}') diff --git a/installer.py b/installer.py index 66602ca0b..0c25f0446 100644 --- a/installer.py +++ b/installer.py @@ -212,7 +212,10 @@ def installed(package, friendly: str = None, reload = False, quiet = False): pkgs = [p for p in package.split() if not p.startswith('-') and not p.startswith('=')] pkgs = [p.split('/')[-1] for p in pkgs] # get only package name if installing from url for pkg in pkgs: - if '>=' 
in pkg: + if '!=' in pkg: + p = pkg.split('!=') + return True # check for not equal always return true + elif '>=' in pkg: p = pkg.split('>=') else: p = pkg.split('==') @@ -485,7 +488,8 @@ def check_torchao(): def install_cuda(): log.info('CUDA: nVidia toolkit detected') install('onnxruntime-gpu', 'onnxruntime-gpu', ignore=True, quiet=True) - return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124') + # return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124') + return os.environ.get('TORCH_COMMAND', 'torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cu124') def install_rocm_zluda(): @@ -566,8 +570,11 @@ def install_rocm_zluda(): log.info('Using CPU-only torch') torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision') else: - if rocm.version is None or float(rocm.version) > 6.1: # assume the latest if version check fails - torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1') + if rocm.version is None or float(rocm.version) >= 6.1: # assume the latest if version check fails + #torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1') + torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.1 torchvision==0.19.1+rocm6.1 --index-url https://download.pytorch.org/whl/rocm6.1') + elif rocm.version == "6.0": # lock to 2.4.1, older rocm (5.7) uses torch 2.3 + torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.0 torchvision==0.19.1+rocm6.0 --index-url https://download.pytorch.org/whl/rocm6.0') elif float(rocm.version) < 5.5: # oldest supported version is 5.5 log.warning(f"ROCm: unsupported version={rocm.version}") log.warning("ROCm: minimum supported version=5.5") @@ -583,7 +590,7 @@ def install_rocm_zluda(): ort_package = os.environ.get('ONNXRUNTIME_PACKAGE', f"--pre onnxruntime-training{'' if ort_version is None else ('==' + ort_version)} --index-url https://pypi.lsh.sh/{rocm.version[0]}{rocm.version[2]} --extra-index-url https://pypi.org/simple") install(ort_package, 'onnxruntime-training') - if device is not None: + if installed("torch") and device is not None: if 'Flash attention' in opts.get('sdp_options'): if not installed('flash-attn'): install(rocm.get_flash_attention_command(device), reinstall=True) @@ -616,26 +623,10 @@ def install_ipex(torch_command): os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100') if "linux" in sys.platform: torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.3.1+cxx11.abi torchvision==0.18.1+cxx11.abi intel-extension-for-pytorch==2.3.110+xpu oneccl_bind_pt==2.3.100+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/') + # torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/test/xpu') # test wheels are stable previews, significantly slower than IPEX # os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow==2.15.1 intel-extension-for-tensorflow[xpu]==2.15.0.1') else: - if sys.version_info.minor == 11: - pytorch_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp311-cp311-win_amd64.whl' - torchvision_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torchvision-0.16.0a0+cxx11.abi-cp311-cp311-win_amd64.whl' - ipex_pip = 
'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp311-cp311-win_amd64.whl' - torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}') - elif sys.version_info.minor == 10: - pytorch_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp310-cp310-win_amd64.whl' - torchvision_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torchvision-0.16.0a0+cxx11.abi-cp310-cp310-win_amd64.whl' - ipex_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp310-cp310-win_amd64.whl' - torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}') - else: - torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.1.0.post3 torchvision==0.16.0.post3 intel-extension-for-pytorch==2.1.40+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/') - if os.environ.get('DISABLE_VENV_LIBS', None) is None: - install(os.environ.get('MKL_PACKAGE', 'mkl==2024.2.0'), 'mkl') - install(os.environ.get('DPCPP_PACKAGE', 'mkl-dpcpp==2024.2.0'), 'mkl-dpcpp') - install(os.environ.get('ONECCL_PACKAGE', 'oneccl-devel==2021.13.0'), 'oneccl-devel') - install(os.environ.get('MPI_PACKAGE', 'impi-devel==2021.13.0'), 'impi-devel') - torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}') + torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu') # torchvision doesn't exist on test/stable branch for windows install(os.environ.get('OPENVINO_PACKAGE', 'openvino==2024.3.0'), 'openvino', ignore=True) install('nncf==2.7.0', 'nncf', ignore=True) install(os.environ.get('ONNXRUNTIME_PACKAGE', 'onnxruntime-openvino'), 'onnxruntime-openvino', ignore=True) @@ -697,7 +688,7 @@ def check_torch(): allow_ipex = not (args.use_cuda or args.use_rocm or args.use_directml or args.use_openvino) allow_directml = not (args.use_cuda or args.use_rocm or args.use_ipex or args.use_openvino) allow_openvino = not (args.use_cuda or args.use_rocm or args.use_ipex or args.use_directml) - log.debug(f'Torch overrides: cuda={args.use_cuda} rocm={args.use_rocm} ipex={args.use_ipex} diml={args.use_directml} openvino={args.use_openvino} zluda={args.use_zluda}') + log.debug(f'Torch overrides: cuda={args.use_cuda} rocm={args.use_rocm} ipex={args.use_ipex} directml={args.use_directml} openvino={args.use_openvino} zluda={args.use_zluda}') # log.debug(f'Torch allowed: cuda={allow_cuda} rocm={allow_rocm} ipex={allow_ipex} diml={allow_directml} openvino={allow_openvino}') torch_command = os.environ.get('TORCH_COMMAND', '') @@ -1038,6 +1029,8 @@ def set_environment(): os.environ.setdefault('UVICORN_TIMEOUT_KEEP_ALIVE', '60') os.environ.setdefault('KINETO_LOG_LEVEL', '3') os.environ.setdefault('DO_NOT_TRACK', '1') + os.environ.setdefault('UV_INDEX_STRATEGY', 'unsafe-any-match') + os.environ.setdefault('UV_NO_BUILD_ISOLATION', '1') os.environ.setdefault('HF_HUB_CACHE', opts.get('hfcache_dir', os.path.join(os.path.expanduser('~'), '.cache', 'huggingface', 'hub'))) allocator = f'garbage_collection_threshold:{opts.get("torch_gc_threshold", 80)/100:0.2f},max_split_size_mb:512' if opts.get("torch_malloc", "native") == 'cudaMallocAsync': diff --git a/javascript/sdnext.css b/javascript/sdnext.css index 
0f3765694..3c81e7d8e 100644 --- a/javascript/sdnext.css +++ b/javascript/sdnext.css @@ -55,6 +55,8 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; } .gradio-radio { padding: 0 !important; width: max-content !important; } .gradio-slider { margin-right: var(--spacing-sm) !important; width: max-content !important } .gradio-slider input[type="number"] { width: 5em; font-size: var(--text-xs); height: 16px; text-align: right; padding: 0; } +.gradio-checkboxgroup { padding: 0 !important; } +.gradio-checkbox > label { color: var(--block-title-text-color) !important; } /* custom gradio elements */ .accordion-compact { padding: 8px 0px 4px 0px !important; } diff --git a/modules/apg/__init__.py b/modules/apg/__init__.py index 0bf6f1c8d..1dc7a619e 100644 --- a/modules/apg/__init__.py +++ b/modules/apg/__init__.py @@ -27,12 +27,15 @@ def project( v0: torch.Tensor, # [B, C, H, W] v1: torch.Tensor, # [B, C, H, W] ): + device = v0.device dtype = v0.dtype + if device.type == "xpu": + v0, v1 = v0.to("cpu"), v1.to("cpu") v0, v1 = v0.double(), v1.double() v1 = torch.nn.functional.normalize(v1, dim=[-1, -2, -3]) v0_parallel = (v0 * v1).sum(dim=[-1, -2, -3], keepdim=True) * v1 v0_orthogonal = v0 - v0_parallel - return v0_parallel.to(dtype), v0_orthogonal.to(dtype) + return v0_parallel.to(device, dtype=dtype), v0_orthogonal.to(device, dtype=dtype) def normalized_guidance( diff --git a/modules/devices.py b/modules/devices.py index f69d19f53..17e6c8f0f 100644 --- a/modules/devices.py +++ b/modules/devices.py @@ -291,6 +291,7 @@ def set_cudnn_params(): torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True + torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) except Exception: pass if torch.backends.cudnn.is_available(): diff --git a/modules/intel/ipex/__init__.py b/modules/intel/ipex/__init__.py index b84a853ed..e1c476e7e 100644 --- a/modules/intel/ipex/__init__.py +++ b/modules/intel/ipex/__init__.py @@ -16,6 +16,13 @@ def ipex_init(): # pylint: disable=too-many-statements if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked: return True, "Skipping IPEX hijack" else: + try: # force xpu device on torch compile and triton + torch._inductor.utils.GPU_TYPES = ["xpu"] + torch._inductor.utils.get_gpu_type = lambda *args, **kwargs: "xpu" + from triton import backends as triton_backends # pylint: disable=import-error + triton_backends.backends["nvidia"].driver.is_active = lambda *args, **kwargs: False + except Exception: + pass # Replace cuda with xpu: torch.cuda.current_device = torch.xpu.current_device torch.cuda.current_stream = torch.xpu.current_stream @@ -115,26 +122,26 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.traceback = torch.xpu.traceback # Memory: - if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): + if legacy and 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): torch.xpu.empty_cache = lambda: None torch.cuda.empty_cache = torch.xpu.empty_cache if legacy: - torch.cuda.memory = torch.xpu.memory - torch.cuda.memory_stats = torch.xpu.memory_stats torch.cuda.memory_summary = torch.xpu.memory_summary torch.cuda.memory_snapshot = torch.xpu.memory_snapshot - torch.cuda.memory_allocated = torch.xpu.memory_allocated - torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated - torch.cuda.memory_reserved = torch.xpu.memory_reserved - 
torch.cuda.memory_cached = torch.xpu.memory_reserved - torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved - torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved - torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats - torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict - torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats + torch.cuda.memory = torch.xpu.memory + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats # RNG: torch.cuda.get_rng_state = torch.xpu.get_rng_state @@ -183,7 +190,8 @@ def ipex_init(): # pylint: disable=too-many-statements torch._C._XpuDeviceProperties.minor = 1 # Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] + torch.xpu.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = torch.xpu.mem_get_info torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -192,13 +200,13 @@ def ipex_init(): # pylint: disable=too-many-statements torch.backends.cuda.is_built = lambda *args, **kwargs: True torch.version.cuda = "12.1" torch.cuda.get_arch_list = lambda: ["ats-m150", "pvc"] - torch.cuda.get_device_capability = lambda *args, **kwargs: [12,1] + torch.cuda.get_device_capability = lambda *args, **kwargs: (12,1) torch.cuda.get_device_properties.major = 12 torch.cuda.get_device_properties.minor = 1 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 - ipex_hijacks() + ipex_hijacks(legacy=legacy) try: from .diffusers import ipex_diffusers ipex_diffusers() diff --git a/modules/intel/ipex/hijacks.py b/modules/intel/ipex/hijacks.py index bb79053c4..7ec94138d 100644 --- a/modules/intel/ipex/hijacks.py +++ b/modules/intel/ipex/hijacks.py @@ -293,7 +293,9 @@ def torch_load(f, map_location=None, *args, **kwargs): # Hijack Functions: -def ipex_hijacks(): +def ipex_hijacks(legacy=True): + if legacy: + torch.nn.functional.interpolate = interpolate torch.tensor = torch_tensor torch.Tensor.to = Tensor_to torch.Tensor.cuda = Tensor_cuda @@ -319,7 +321,6 @@ def ipex_hijacks(): torch.nn.functional.layer_norm = functional_layer_norm torch.nn.functional.linear = functional_linear torch.nn.functional.conv2d = functional_conv2d - torch.nn.functional.interpolate = interpolate 
torch.nn.functional.pad = functional_pad torch.bmm = torch_bmm diff --git a/modules/interrogate.py b/modules/interrogate.py index 68c8aca00..5ae06fb90 100644 --- a/modules/interrogate.py +++ b/modules/interrogate.py @@ -252,6 +252,8 @@ def get_clip_models(): def load_interrogator(clip_model, blip_model): + from installer import install + install('clip_interrogator==0.6.0') import clip_interrogator clip_interrogator.CAPTION_MODELS = caption_models global ci # pylint: disable=global-statement diff --git a/modules/loader.py b/modules/loader.py index 18cb42893..a2970abfd 100644 --- a/modules/loader.py +++ b/modules/loader.py @@ -1,5 +1,6 @@ from __future__ import annotations from functools import partial +import os import re import sys import logging @@ -13,7 +14,11 @@ logging.getLogger("DeepSpeed").disabled = True +os.environ.setdefault('TORCH_LOGS', '-all') import torch # pylint: disable=C0411 +if torch.__version__.startswith('2.5.0'): + errors.log.warning(f'Disabling cuDNN for SDP on torch={torch.__version__}') + torch.backends.cuda.enable_cudnn_sdp(False) try: import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import errors.log.debug(f'Load IPEX=={ipex.__version__}') @@ -96,7 +101,6 @@ def get_packages(): } try: - import os import math cores = os.cpu_count() affinity = len(os.sched_getaffinity(0)) diff --git a/modules/memstats.py b/modules/memstats.py index eab4a677a..c417165a2 100644 --- a/modules/memstats.py +++ b/modules/memstats.py @@ -27,12 +27,12 @@ def gb(val: float): s = torch.cuda.mem_get_info() gpu = { 'used': gb(s[1] - s[0]), 'total': gb(s[1]) } s = dict(torch.cuda.memory_stats()) - if s['num_ooms'] > 0: + if s.get('num_ooms', 0) > 0: shared.state.oom = True mem.update({ 'gpu': gpu, - 'retries': s['num_alloc_retries'], - 'oom': s['num_ooms'] + 'retries': s.get('num_alloc_retries', 0), + 'oom': s.get('num_ooms', 0) }) return mem except Exception: diff --git a/modules/model_stablecascade.py b/modules/model_stablecascade.py index 44cea1818..444d11adc 100644 --- a/modules/model_stablecascade.py +++ b/modules/model_stablecascade.py @@ -6,7 +6,7 @@ def get_timestep_ratio_conditioning(t, alphas_cumprod): - s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008 + s = torch.tensor([0.008]) clamp_range = [0, 1] min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2 var = alphas_cumprod[t] @@ -133,8 +133,6 @@ def load_cascade_combined(checkpoint_info, diffusers_load_config): sd_model = StableCascadeCombinedPipeline.from_pretrained(checkpoint_info.path, cache_dir=shared.opts.diffusers_dir, **diffusers_load_config) sd_model.prior_pipe.scheduler.config.clip_sample = False - sd_model.default_scheduler = copy.deepcopy(sd_model.prior_pipe.scheduler) - sd_model.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning sd_model.decoder_pipe.text_encoder = sd_model.text_encoder = None # Nothing uses the decoder's text encoder sd_model.prior_pipe.image_encoder = sd_model.prior_image_encoder = None # No img2img is implemented yet sd_model.prior_pipe.feature_extractor = sd_model.prior_feature_extractor = None # No img2img is implemented yet diff --git a/modules/processing.py b/modules/processing.py index 471053f50..04350ee39 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -254,6 +254,7 @@ def process_init(p: StableDiffusionProcessing): p.all_prompts, p.all_negative_prompts = shared.prompt_styles.apply_styles_to_prompts(p.all_prompts, p.all_negative_prompts, p.styles, p.all_seeds) p.prompts = 
p.all_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] p.negative_prompts = p.all_negative_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] + p.prompts, _ = extra_networks.parse_prompts(p.prompts) def process_images_inner(p: StableDiffusionProcessing) -> Processed: diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py index cd0763eb9..9b485d361 100644 --- a/modules/processing_diffusers.py +++ b/modules/processing_diffusers.py @@ -404,8 +404,10 @@ def process_diffusers(p: processing.StableDiffusionProcessing): shared.sd_model = sd_models.set_diffuser_pipe(shared.sd_model, sd_models.DiffusersTaskType.INPAINTING) # force pipeline if len(getattr(p, 'init_images', [])) == 0: p.init_images = [TF.to_pil_image(torch.rand((3, getattr(p, 'height', 512), getattr(p, 'width', 512))))] - p.prompts = p.all_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] - p.negative_prompts = p.all_negative_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] + if p.prompts is None or len(p.prompts) == 0: + p.prompts = p.all_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] + if p.negative_prompts is None or len(p.negative_prompts) == 0: + p.negative_prompts = p.all_negative_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] sd_models.move_model(shared.sd_model, devices.device) sd_models_compile.openvino_recompile_model(p, hires=False, refiner=False) # recompile if a parameter changes diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index 8ddd1f8ec..75e51f4da 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -20,7 +20,6 @@ import modules.textual_inversion.textual_inversion from modules import devices, sd_hijack_optimizations -from modules import sd_hijack_clip, sd_hijack_open_clip, sd_hijack_unet, sd_hijack_xlmr, xlmr from modules.hypernetworks import hypernetwork attention_CrossAttention_forward = ldm.modules.attention.CrossAttention.forward @@ -40,6 +39,7 @@ def apply_optimizations(): undo_optimizations() + from modules import sd_hijack_unet ldm.modules.diffusionmodules.model.nonlinearity = silu ldm.modules.diffusionmodules.openaimodel.th = sd_hijack_unet.th optimization_method = None @@ -159,6 +159,7 @@ def __init__(self): self.embedding_db.add_embedding_dir(shared.opts.embeddings_dir) def hijack(self, m): + from modules import sd_hijack_clip, sd_hijack_open_clip, sd_hijack_unet, sd_hijack_xlmr, xlmr if type(m.cond_stage_model) == xlmr.BertSeriesModelWithTransformation: model_embeddings = m.cond_stage_model.roberta.embeddings model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.word_embeddings, self) @@ -223,6 +224,7 @@ def flatten(el): self.layers = flatten(m) def undo_hijack(self, m): + from modules import sd_hijack_clip, sd_hijack_open_clip, xlmr if not hasattr(m, 'cond_stage_model'): return # not ldm model if type(m.cond_stage_model) == xlmr.BertSeriesModelWithTransformation: diff --git a/modules/sd_models.py b/modules/sd_models.py index ef3960fcf..47d0c5947 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -737,8 +737,8 @@ def eval_model(model, op=None, sd_model=None): # pylint: disable=unused-argument model.eval() return model sd_model = sd_models_compile.apply_compile_to_model(sd_model, eval_model, ["Model", "VAE", "Text Encoder"], op="eval") - if shared.opts.diffusers_quantization: - sd_model = sd_models_compile.dynamic_quantization(sd_model) + if len(shared.opts.torchao_quantization) > 0: + sd_model = 
sd_models_compile.torchao_quantization(sd_model) if shared.opts.opt_channelslast and hasattr(sd_model, 'unet'): shared.log.debug(f'Setting {op}: channels-last=True') @@ -1193,7 +1193,7 @@ def load_diffuser_file(model_type, pipeline, checkpoint_info, diffusers_load_con from diffusers.utils import import_utils import_utils._accelerate_available = False # pylint: disable=protected-access if shared.opts.diffusers_to_gpu and model_type.startswith('Stable Diffusion'): - shared.log.debug(f'Diffusers accelerate: hijack={shared.opts.diffusers_to_gpu}') + shared.log.debug(f'Diffusers accelerate: direct={shared.opts.diffusers_to_gpu}') sd_hijack_accelerate.hijack_accelerate() else: sd_hijack_accelerate.restore_accelerate() @@ -1298,7 +1298,10 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No sd_model.sd_model_hash = checkpoint_info.calculate_shorthash() # pylint: disable=attribute-defined-outside-init sd_model.sd_checkpoint_info = checkpoint_info # pylint: disable=attribute-defined-outside-init sd_model.sd_model_checkpoint = checkpoint_info.filename # pylint: disable=attribute-defined-outside-init - sd_model.default_scheduler = copy.deepcopy(sd_model.scheduler) if hasattr(sd_model, "scheduler") else None + if hasattr(sd_model, "prior_pipe"): + sd_model.default_scheduler = copy.deepcopy(sd_model.prior_pipe.scheduler) if hasattr(sd_model.prior_pipe, "scheduler") else None + else: + sd_model.default_scheduler = copy.deepcopy(sd_model.scheduler) if hasattr(sd_model, "scheduler") else None sd_model.is_sdxl = False # a1111 compatibility item sd_model.is_sd2 = hasattr(sd_model, 'cond_stage_model') and hasattr(sd_model.cond_stage_model, 'model') # a1111 compatibility item sd_model.is_sd1 = not sd_model.is_sd2 # a1111 compatibility item diff --git a/modules/sd_models_compile.py b/modules/sd_models_compile.py index ff4fa6aff..91ed84ded 100644 --- a/modules/sd_models_compile.py +++ b/modules/sd_models_compile.py @@ -183,7 +183,7 @@ def nncf_compress_model(model, op=None, sd_model=None): def nncf_compress_weights(sd_model): try: t0 = time.time() - shared.log.info(f"NNCF Compress Weights: {shared.opts.nncf_compress_weights}") + shared.log.info(f"Quantization: type=NNCF modules={shared.opts.nncf_compress_weights}") global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement install('nncf==2.7.0', quiet=True) @@ -199,9 +199,9 @@ def nncf_compress_weights(sd_model): quant_last_model_device = None t1 = time.time() - shared.log.info(f"NNCF Compress Weights: time={t1-t0:.2f}") + shared.log.info(f"Quantization: type=NNCF time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"NNCF Compress Weights: error: {e}") + shared.log.warning(f"Quantization: type=NNCF {e}") return sd_model @@ -249,10 +249,10 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation def optimum_quanto_weights(sd_model): try: if shared.opts.diffusers_offload_mode in {"balanced", "sequential"}: - shared.log.warning(f"Optimum Quanto Weights is incompatible with {shared.opts.diffusers_offload_mode} offload!") + shared.log.warning(f"Quantization: type=Optimum.quanto offload={shared.opts.diffusers_offload_mode} not compatible") return sd_model t0 = time.time() - shared.log.info(f"Optimum Quanto Weights: {shared.opts.optimum_quanto_weights}") + shared.log.info(f"Quantization: type=Optimum.quanto: modules={shared.opts.optimum_quanto_weights}") global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement quanto = 
model_quant.load_quanto() quanto.tensor.qbits.QBitsTensor.create = lambda *args, **kwargs: quanto.tensor.qbits.QBitsTensor(*args, **kwargs) @@ -299,9 +299,9 @@ def encode_prompt(*args, **kwargs): devices.torch_gc(force=True) t1 = time.time() - shared.log.info(f"Optimum Quanto Weights: time={t1-t0:.2f}") + shared.log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"Optimum Quanto Weights: error: {e}") + shared.log.warning(f"Quantization: type=Optimum.quanto {e}") return sd_model @@ -329,7 +329,7 @@ def compile_onediff(sd_model): from onediff.infer_compiler import oneflow_compile except Exception as e: - shared.log.warning(f"Model compile using onediff/oneflow: {e}") + shared.log.warning(f"Model compile: task=onediff {e}") return sd_model try: @@ -351,9 +351,9 @@ def compile_onediff(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: task=onediff/oneflow time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=onediff time={t1-t0:.2f}") except Exception as e: - shared.log.info(f"Model compile: task=onediff/oneflow error: {e}") + shared.log.info(f"Model compile: task=onediff {e}") return sd_model @@ -361,7 +361,7 @@ def compile_stablefast(sd_model): try: import sfast.compilers.stable_diffusion_pipeline_compiler as sf except Exception as e: - shared.log.warning(f'Model compile using stable-fast: {e}') + shared.log.warning(f'Model compile: task=stablefast: {e}') return sd_model config = sf.CompilationConfig.Default() try: @@ -390,9 +390,9 @@ def compile_stablefast(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: task='Stable-fast' config={config.__dict__} time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=stablefast config={config.__dict__} time={t1-t0:.2f}") except Exception as e: - shared.log.info(f"Model compile: task=Stable-fast error: {e}") + shared.log.info(f"Model compile: task=stablefast {e}") return sd_model @@ -401,7 +401,7 @@ def compile_torch(sd_model): t0 = time.time() import torch._dynamo # pylint: disable=unused-import,redefined-outer-name torch._dynamo.reset() # pylint: disable=protected-access - shared.log.debug(f"Model compile available backends: {torch._dynamo.list_backends()}") # pylint: disable=protected-access + shared.log.debug(f"Model compile: task=torch backends={torch._dynamo.list_backends()}") # pylint: disable=protected-access def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused-argument if hasattr(model, "device") and model.device.type != "meta": @@ -442,7 +442,7 @@ def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused torch._inductor.config.use_mixed_mm = True # pylint: disable=protected-access # torch._inductor.config.force_fuse_int_mm_with_mul = True # pylint: disable=protected-access except Exception as e: - shared.log.error(f"Torch inductor config error: {e}") + shared.log.error(f"Model compile: torch inductor config error: {e}") sd_model = apply_compile_to_model(sd_model, function=torch_compile_model, options=shared.opts.cuda_compile, op="compile") @@ -450,9 +450,9 @@ def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=torch time={t1-t0:.2f}") except Exception as e: - 
shared.log.warning(f"Model compile error: {e}") + shared.log.warning(f"Model compile: task=torch {e}") return sd_model @@ -467,19 +467,19 @@ def check_deepcache(enable: bool): def compile_deepcache(sd_model): global deepcache_worker # pylint: disable=global-statement if not hasattr(sd_model, 'unet'): - shared.log.warning(f'Model compile using deep-cache: {sd_model.__class__} not supported') + shared.log.warning(f'Model compile: task=deepcache pipeline={sd_model.__class__} not supported') return sd_model try: from DeepCache import DeepCacheSDHelper except Exception as e: - shared.log.warning(f'Model compile using deep-cache: {e}') + shared.log.warning(f'Model compile: task=deepcache {e}') return sd_model t0 = time.time() check_deepcache(False) deepcache_worker = DeepCacheSDHelper(pipe=sd_model) deepcache_worker.set_params(cache_interval=shared.opts.deep_cache_interval, cache_branch_id=0) t1 = time.time() - shared.log.info(f"Model compile: task='DeepCache' config={deepcache_worker.params} time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=deepcache config={deepcache_worker.params} time={t1-t0:.2f}") # config={'cache_interval': 3, 'cache_layer_id': 0, 'cache_block_id': 0, 'skip_mode': 'uniform'} time=0.00 return sd_model @@ -503,40 +503,56 @@ def compile_diffusers(sd_model): return sd_model -def dynamic_quantization(sd_model): +def torchao_quantization(sd_model): try: install('torchao', quiet=True) - from torchao.quantization import autoquant + from torchao import quantization as q except Exception as e: - shared.log.error(f"Model dynamic quantization not supported: {e}") + shared.log.error(f"Quantization: type=TorchAO quantization not supported: {e}") return sd_model - - """ - from torchao.quantization import quant_api - def dynamic_quant_filter_fn(mod, *args): # pylint: disable=unused-argument - return (isinstance(mod, torch.nn.Linear) and mod.in_features > 16 and (mod.in_features, mod.out_features) - not in [(1280, 640), (1920, 1280), (1920, 640), (2048, 1280), (2048, 2560), (2560, 1280), (256, 128), (2816, 1280), (320, 640), (512, 1536), (512, 256), (512, 512), (640, 1280), (640, 1920), (640, 320), (640, 5120), (640, 640), (960, 320), (960, 640)]) - - def conv_filter_fn(mod, *args): # pylint: disable=unused-argument - return (isinstance(mod, torch.nn.Conv2d) and mod.kernel_size == (1, 1) and 128 in [mod.in_channels, mod.out_channels]) - - quant_api.swap_conv2d_1x1_to_linear(sd_model.unet, conv_filter_fn) - quant_api.swap_conv2d_1x1_to_linear(sd_model.vae, conv_filter_fn) - quant_api.apply_dynamic_quant(sd_model.unet, dynamic_quant_filter_fn) - quant_api.apply_dynamic_quant(sd_model.vae, dynamic_quant_filter_fn) - """ - - shared.log.info(f"Model dynamic quantization: pipeline={sd_model.__class__.__name__}") + if shared.opts.torchao_quantization_type == "int8+act": + fn = q.int8_dynamic_activation_int8_weight + elif shared.opts.torchao_quantization_type == "int8": + fn = q.int8_weight_only + elif shared.opts.torchao_quantization_type == "int4": + fn = q.int4_weight_only + elif shared.opts.torchao_quantization_type == "fp8+act": + fn = q.float8_dynamic_activation_float8_weight + elif shared.opts.torchao_quantization_type == "fp8": + fn = q.float8_weight_only + elif shared.opts.torchao_quantization_type == "fpx": + fn = q.fpx_weight_only + else: + shared.log.error(f"Quantization: type=TorchAO type={shared.opts.torchao_quantization_type} not supported") + return sd_model + shared.log.info(f"Quantization: type=TorchAO pipe={sd_model.__class__.__name__} 
quant={shared.opts.torchao_quantization_type} fn={fn} targets={shared.opts.torchao_quantization}") try: - if shared.sd_model_type == 'sd' or shared.sd_model_type == 'sdxl': - sd_model.unet = sd_model.unet.to(devices.device) - sd_model.unet = autoquant(sd_model.unet, error_on_unseen=False) - elif shared.sd_model_type == 'f1': - sd_model.transformer = autoquant(sd_model.transformer, error_on_unseen=False) - else: - shared.log.error(f"Model dynamic quantization not supported: {shared.sd_model_type}") + t0 = time.time() + modules = [] + if hasattr(sd_model, 'unet') and 'Model' in shared.opts.torchao_quantization: + modules.append('unet') + q.quantize_(sd_model.unet, fn(), device=devices.device) + if hasattr(sd_model, 'transformer') and 'Model' in shared.opts.torchao_quantization: + modules.append('transformer') + q.quantize_(sd_model.transformer, fn(), device=devices.device) + # sd_model.transformer = q.autoquant(sd_model.transformer, error_on_unseen=False) + if hasattr(sd_model, 'vae') and 'VAE' in shared.opts.torchao_quantization: + modules.append('vae') + q.quantize_(sd_model.vae, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te1') + q.quantize_(sd_model.text_encoder, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder_2') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te2') + q.quantize_(sd_model.text_encoder_2, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder_3') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te3') + q.quantize_(sd_model.text_encoder_3, fn(), device=devices.device) + t1 = time.time() + shared.log.info(f"Quantization: type=TorchAO modules={modules} time={t1-t0:.2f}") except Exception as e: - shared.log.error(f"Model dynamic quantization: {e}") + shared.log.error(f"Quantization: type=TorchAO {e}") + setup_logging() # torchao uses dynamo which messes with logging so reset is needed return sd_model diff --git a/modules/shared.py b/modules/shared.py index 6f7c14d75..69574ff92 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -492,11 +492,12 @@ def get_default_modes(): "quant_sep": OptionInfo("
Model Quantization
", "", gr.HTML, {"visible": native}), "quant_shuffle_weights": OptionInfo(False, "Shuffle the weights between GPU and CPU when quantizing", gr.Checkbox, {"visible": native}), - "diffusers_quantization": OptionInfo(False, "Dynamic quantization with TorchAO", gr.Checkbox, {"visible": native}), - "nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF INT8", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), - "optimum_quanto_weights": OptionInfo([], "Quantize Model weights with Optimum Quanto", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), - "optimum_quanto_weights_type": OptionInfo("qint8", "Weights type for Optimum Quanto", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}), - "optimum_quanto_activations_type": OptionInfo("none", "Activations type for Optimum Quanto", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}), + "nncf_compress_weights": OptionInfo([], "NNCF int8 compression enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), + "optimum_quanto_weights": OptionInfo([], "Optimum.quanto quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), + "optimum_quanto_weights_type": OptionInfo("qint8", "Optimum.quanto quantization type", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}), + "optimum_quanto_activations_type": OptionInfo("none", "Optimum.quanto quantization activations ", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}), + "torchao_quantization": OptionInfo([], "TorchAO quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": native}), + "torchao_quantization_type": OptionInfo("int8", "TorchAO quantization type", gr.Radio, {"choices": ["int8+act", "int8", "int4", "fp8+act", "fp8", "fpx"], "visible": native}), "ipex_sep": OptionInfo("
IPEX
", "", gr.HTML, {"visible": devices.backend == "ipex"}), "ipex_optimize": OptionInfo([], "IPEX Optimize for Intel GPUs", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "Upscaler"], "visible": devices.backend == "ipex"}), @@ -712,6 +713,7 @@ def get_default_modes(): "theme_type": OptionInfo("Standard", "Theme type", gr.Radio, {"choices": ["Modern", "Standard", "None"]}), "theme_style": OptionInfo("Auto", "Theme mode", gr.Radio, {"choices": ["Auto", "Dark", "Light"]}), "gradio_theme": OptionInfo("black-teal", "UI theme", gr.Dropdown, lambda: {"choices": theme.list_themes()}, refresh=theme.refresh_themes), + "autolaunch": OptionInfo(False, "Autolaunch browser upon startup"), "font_size": OptionInfo(14, "Font size", gr.Slider, {"minimum": 8, "maximum": 32, "step": 1, "visible": True}), "tooltips": OptionInfo("UI Tooltips", "UI tooltips", gr.Radio, {"choices": ["None", "Browser default", "UI tooltips"], "visible": False}), "aspect_ratios": OptionInfo("1:1, 4:3, 3:2, 16:9, 16:10, 21:9, 2:3, 3:4, 9:16, 10:16, 9:21", "Allowed aspect ratios"), diff --git a/requirements.txt b/requirements.txt index 4999331ca..374451a57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ patch-ng anyio addict astunparse -clean-fid filetype future GitPython @@ -12,7 +11,6 @@ inflection jsonmerge kornia lark -lpips omegaconf optimum piexif @@ -35,8 +33,6 @@ peft==0.13.1 httpx==0.24.1 compel==2.0.3 torchsde==0.2.6 -open-clip-torch -clip-interrogator==0.6.0 antlr4-python3-runtime==4.9.3 requests==2.32.3 tqdm==4.66.5 @@ -65,3 +61,7 @@ torchdiffeq dctorch scikit-image seam-carving +open-clip-torch + +# TODO temporary block for torch==2.5.0 +torchvision!=0.20.0 diff --git a/webui.py b/webui.py index f0f29e13a..32e7ed75b 100644 --- a/webui.py +++ b/webui.py @@ -350,7 +350,7 @@ def webui(restart=False): continue logger.handlers = log.handlers # autolaunch only on initial start - if cmd_opts.autolaunch and local_url is not None: + if (shared.opts.autolaunch or cmd_opts.autolaunch) and local_url is not None: cmd_opts.autolaunch = False shared.log.info('Launching browser') import webbrowser
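A closing note that ties the `requirements.txt` and `installer.py` changes together: the new `torchvision!=0.20.0` line is an exclusion pin, and the updated `installed()` check now treats any `!=` specifier as satisfied locally, so the exclusion only takes effect when pip actually resolves the requirements. Below is a simplified, hypothetical illustration of that check, not the real installer function, and with a deliberately naive version comparison.

```python
# Hypothetical, simplified sketch of the specifier handling added to installer.installed().
# Exclusion pins such as "torchvision!=0.20.0" are never compared against the local
# version; pip enforces them when the package is (re)installed.
def spec_satisfied(pkg: str, installed_version: str) -> bool:
    if '!=' in pkg:
        return True  # "not equal" pins always pass the local check
    if '>=' in pkg:
        _name, wanted = pkg.split('>=')
        return installed_version >= wanted  # naive lexicographic compare, illustration only
    if '==' in pkg:
        _name, wanted = pkg.split('==')
        return installed_version.startswith(wanted)
    return True  # bare package name: presence alone is enough

print(spec_satisfied('torchvision!=0.20.0', '0.19.1'))  # True
print(spec_satisfied('torch==2.4.1', '2.4.1+cu124'))    # True
```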