diff --git a/CHANGELOG.md b/CHANGELOG.md index 98b0e0d77..2a62ded14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,10 @@ # Change Log for SD.Next -## Update for 2024-10-17 +## Update for 2024-10-18 -### Highlights for 2024-10-17 +### Highlights for 2024-10-18 + +Workflow highlights: - **Reprocess**: New workflow options that allow you to generate at lower quality and then reprocess at higher quality for select images only, or generate without hires/refine and then reprocess with hires/refine @@ -10,6 +12,9 @@ - **Detailer**: Fully built-in detailer workflow with support for all standard models - Built-in **model analyzer**: See all details of your currently loaded model, including components, parameter count, layer count, etc. + +Newly supported: + - New fine-tuned [CLIP-ViT-L](https://huggingface.co/zer0int/CLIP-GmP-ViT-L-14) 1st stage **text-encoders** used by SD15, SDXL, Flux.1, etc. bring additional details to your images - New models: - [CogView 3 Plus](https://huggingface.co/THUDM/CogView3-Plus-3B) @@ -18,10 +23,15 @@ [Ctrl+X](https://github.com/genforce/ctrl-x) which allows for control of **structure and appearance** without the need for extra models, [APG: Adaptive Projected Guidance](https://arxiv.org/pdf/2410.02416) for optimal **guidance** control, [LinFusion](https://github.com/Huage001/LinFusion) for on-the-fly distillation of any sd15/sdxl model + +Otherwise notable: + - Several [Flux.1](https://huggingface.co/black-forest-labs/FLUX.1-dev) optimizations and new quantization types - Auto-detection of best available **device/dtype** settings for your platform and GPU reduces the need for manual configuration - Full rewrite of **sampler options**, now far more streamlined with tons of new options to tweak scheduler behavior - Improved **LoRA** detection and handling for all supported models +- Tons of work on dynamic quantization that can be applied on-the-fly during model load to any model type + Supported quantization engines include TorchAO, Optimum.quanto, NNCF compression, and more...
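To make the quantization highlight concrete: the new `torchao_quantization()` helper added to `modules/sd_models_compile.py` later in this diff boils down to calling `torchao.quantization.quantize_` on the selected pipeline components right after the model is loaded. Below is a minimal standalone sketch of that idea rather than the exact SD.Next code path; the checkpoint path, target device, and the `int8_weight_only` choice are illustrative assumptions.

```python
# Minimal sketch of on-the-fly TorchAO weight-only quantization of a loaded pipeline,
# mirroring what torchao_quantization() in modules/sd_models_compile.py does.
# The checkpoint path and device below are illustrative assumptions.
import torch
import diffusers
from torchao import quantization as q

pipe = diffusers.StableDiffusionXLPipeline.from_single_file(
    '/mnt/models/stable-diffusion/sdxl/model.safetensors',  # hypothetical checkpoint
    torch_dtype=torch.bfloat16,
)

# quantize selected components in-place; the "int8" setting maps to int8_weight_only
q.quantize_(pipe.unet, q.int8_weight_only(), device='cuda')
q.quantize_(pipe.text_encoder, q.int8_weight_only(), device='cuda')
q.quantize_(pipe.text_encoder_2, q.int8_weight_only(), device='cuda')

pipe = pipe.to('cuda')
image = pipe('a photo of a mountain lake at sunrise').images[0]
```

In SD.Next itself this is driven by the new `torchao_quantization` and `torchao_quantization_type` options (settings -> compute settings -> quantization) rather than called directly.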
Oh, and we've compiled a full table with a list of popular text-to-image generative models, their respective parameters, and an architecture overview: @@ -30,7 +40,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition [README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867) -### Details for 2024-10-17 +### Details for 2024-10-18 - **reprocess** - new top-level button: reprocess latent from your history of generated image(s) @@ -211,6 +221,11 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - setting `lora_load_gpu` to load LoRA directly to GPU *default*: true unless lowvram +- **torchao** + - reimplement torchao quantization + - configure in settings -> compute settings -> quantization + - can be applied to any model on-the-fly during load + - **huggingface**: - force logout/login on token change - unified handling of cache folder: set via `HF_HUB` or `HF_HUB_CACHE` or via settings -> system paths @@ -219,6 +234,9 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - add support for *image2video* (in addition to previous *text2video* and *video2video*) - *note*: *image2video* requires separate 5b model variant +- **torch** + - due to numerous issues with torch 2.5.0, which was just released as stable, we are sticking with 2.4.1 for now + - **backend=original** is now marked as in maintenance-only mode - **python 3.12** improved compatibility, automatically handle `setuptools` - **control** @@ -233,10 +251,12 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition - fix update infotext on image select - fix imageviewer exif parser - selectable info view in image viewer, thanks @ZeldaMaster501 + - setting to enable browser autolaunch, thanks @brknsoul - **free-u** check if device/dtype are fft compatible and cast as necessary - **rocm** - additional gpu detection and auto-config code, thanks @lshqqytiger - experimental triton backend for flash attention, thanks @lshqqytiger + - update to rocm 6.2, thanks @Disty0 - **directml** - update `torch` to 2.4.1, thanks @lshqqytiger - **extensions** diff --git a/cli/load_unet.py b/cli/load_unet.py new file mode 100644 index 000000000..2398cdb64 --- /dev/null +++ b/cli/load_unet.py @@ -0,0 +1,89 @@ +import torch +import diffusers + + +class StateDictStats(): + cls: str = None + device: torch.device = None + params: int = 0 + weights: dict = {} + dtypes: dict = {} + config: dict = None + + def __repr__(self): + return f'cls={self.cls} params={self.params} weights={self.weights} device={self.device} dtypes={self.dtypes} config={self.config is not None}' + + +def set_module_tensor( + module: torch.nn.Module, + name: str, + value: torch.Tensor, + stats: StateDictStats, + device: torch.device = None, + dtype: torch.dtype = None, +): + if "."
in name: + splits = name.split(".") + for split in splits[:-1]: + module = getattr(module, split) + name = splits[-1] + old_value = getattr(module, name) + with torch.no_grad(): + if value.dtype not in stats.dtypes: + stats.dtypes[value.dtype] = 0 + stats.dtypes[value.dtype] += 1 + if name in module._buffers: # pylint: disable=protected-access + module._buffers[name] = value.to(device=device, dtype=dtype, non_blocking=True) # pylint: disable=protected-access + if 'buffers' not in stats.weights: + stats.weights['buffers'] = 0 + stats.weights['buffers'] += 1 + elif value is not None: + param_cls = type(module._parameters[name]) # pylint: disable=protected-access + module._parameters[name] = param_cls(value, requires_grad=old_value.requires_grad).to(device, dtype=dtype, non_blocking=True) # pylint: disable=protected-access + if 'parameters' not in stats.weights: + stats.weights['parameters'] = 0 + stats.weights['parameters'] += 1 + + +def load_unet(config_file: str, state_dict: dict, device: torch.device = None, dtype: torch.dtype = None): + # same can be done for other modules or even for entire model by loading model config and then walking through its modules + from accelerate import init_empty_weights + with init_empty_weights(): + stats = StateDictStats() + stats.device = device + stats.config = diffusers.UNet2DConditionModel.load_config(config_file) + unet = diffusers.UNet2DConditionModel.from_config(stats.config) + stats.cls = unet.__class__.__name__ + expected_state_dict_keys = list(unet.state_dict().keys()) + stats.weights['expected'] = len(expected_state_dict_keys) + for param_name, param in state_dict.items(): + if param_name not in expected_state_dict_keys: + if 'unknown' not in stats.weights: + stats.weights['unknown'] = 0 + stats.weights['unknown'] += 1 + continue + set_module_tensor(unet, name=param_name, value=param, device=device, dtype=dtype, stats=stats) + state_dict[param_name] = None # unload as we initialize the model so we dont consume double the memory + stats.params = sum(p.numel() for p in unet.parameters(recurse=True)) + return unet, stats + + +def load_safetensors(fn: str): + import safetensors.torch + state_dict = safetensors.torch.load_file(fn, device='cpu') # state dict should always be loaded to cpu + return state_dict + + +if __name__ == "__main__": + # need pipe already present to load unet state_dict into or we could load unet first and then manually create pipe with params + pipe = diffusers.StableDiffusionXLPipeline.from_single_file('/mnt/models/stable-diffusion/sdxl/TempestV0.1-Artistic.safetensors', cache_dir='/mnt/models/huggingface') + # this could be kept in memory so we dont have to reload it + dct = load_safetensors('/mnt/models/UNET/dpo-sdxl-text2image.safetensors') + pipe.unet, s = load_unet( + config_file = 'configs/sdxl/unet/config.json', # can also point to online hf model with subfolder + state_dict = dct, + device = torch.device('cpu'), # can leave out to use default device + dtype = torch.bfloat16, # can leave out to use default dtype, especially for mixed precision modules + ) + from rich import print as rprint + rprint(f'Stats: {s}') diff --git a/installer.py b/installer.py index 66602ca0b..0c25f0446 100644 --- a/installer.py +++ b/installer.py @@ -212,7 +212,10 @@ def installed(package, friendly: str = None, reload = False, quiet = False): pkgs = [p for p in package.split() if not p.startswith('-') and not p.startswith('=')] pkgs = [p.split('/')[-1] for p in pkgs] # get only package name if installing from url for pkg in pkgs: - if '>=' 
in pkg: + if '!=' in pkg: + p = pkg.split('!=') + return True # check for not equal always return true + elif '>=' in pkg: p = pkg.split('>=') else: p = pkg.split('==') @@ -485,7 +488,8 @@ def check_torchao(): def install_cuda(): log.info('CUDA: nVidia toolkit detected') install('onnxruntime-gpu', 'onnxruntime-gpu', ignore=True, quiet=True) - return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124') + # return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124') + return os.environ.get('TORCH_COMMAND', 'torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cu124') def install_rocm_zluda(): @@ -566,8 +570,11 @@ def install_rocm_zluda(): log.info('Using CPU-only torch') torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision') else: - if rocm.version is None or float(rocm.version) > 6.1: # assume the latest if version check fails - torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1') + if rocm.version is None or float(rocm.version) >= 6.1: # assume the latest if version check fails + #torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1') + torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.1 torchvision==0.19.1+rocm6.1 --index-url https://download.pytorch.org/whl/rocm6.1') + elif rocm.version == "6.0": # lock to 2.4.1, older rocm (5.7) uses torch 2.3 + torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.0 torchvision==0.19.1+rocm6.0 --index-url https://download.pytorch.org/whl/rocm6.0') elif float(rocm.version) < 5.5: # oldest supported version is 5.5 log.warning(f"ROCm: unsupported version={rocm.version}") log.warning("ROCm: minimum supported version=5.5") @@ -583,7 +590,7 @@ def install_rocm_zluda(): ort_package = os.environ.get('ONNXRUNTIME_PACKAGE', f"--pre onnxruntime-training{'' if ort_version is None else ('==' + ort_version)} --index-url https://pypi.lsh.sh/{rocm.version[0]}{rocm.version[2]} --extra-index-url https://pypi.org/simple") install(ort_package, 'onnxruntime-training') - if device is not None: + if installed("torch") and device is not None: if 'Flash attention' in opts.get('sdp_options'): if not installed('flash-attn'): install(rocm.get_flash_attention_command(device), reinstall=True) @@ -616,26 +623,10 @@ def install_ipex(torch_command): os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100') if "linux" in sys.platform: torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.3.1+cxx11.abi torchvision==0.18.1+cxx11.abi intel-extension-for-pytorch==2.3.110+xpu oneccl_bind_pt==2.3.100+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/') + # torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/test/xpu') # test wheels are stable previews, significantly slower than IPEX # os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow==2.15.1 intel-extension-for-tensorflow[xpu]==2.15.0.1') else: - if sys.version_info.minor == 11: - pytorch_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp311-cp311-win_amd64.whl' - torchvision_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torchvision-0.16.0a0+cxx11.abi-cp311-cp311-win_amd64.whl' - ipex_pip = 
'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp311-cp311-win_amd64.whl' - torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}') - elif sys.version_info.minor == 10: - pytorch_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp310-cp310-win_amd64.whl' - torchvision_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torchvision-0.16.0a0+cxx11.abi-cp310-cp310-win_amd64.whl' - ipex_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp310-cp310-win_amd64.whl' - torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}') - else: - torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.1.0.post3 torchvision==0.16.0.post3 intel-extension-for-pytorch==2.1.40+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/') - if os.environ.get('DISABLE_VENV_LIBS', None) is None: - install(os.environ.get('MKL_PACKAGE', 'mkl==2024.2.0'), 'mkl') - install(os.environ.get('DPCPP_PACKAGE', 'mkl-dpcpp==2024.2.0'), 'mkl-dpcpp') - install(os.environ.get('ONECCL_PACKAGE', 'oneccl-devel==2021.13.0'), 'oneccl-devel') - install(os.environ.get('MPI_PACKAGE', 'impi-devel==2021.13.0'), 'impi-devel') - torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}') + torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu') # torchvision doesn't exist on test/stable branch for windows install(os.environ.get('OPENVINO_PACKAGE', 'openvino==2024.3.0'), 'openvino', ignore=True) install('nncf==2.7.0', 'nncf', ignore=True) install(os.environ.get('ONNXRUNTIME_PACKAGE', 'onnxruntime-openvino'), 'onnxruntime-openvino', ignore=True) @@ -697,7 +688,7 @@ def check_torch(): allow_ipex = not (args.use_cuda or args.use_rocm or args.use_directml or args.use_openvino) allow_directml = not (args.use_cuda or args.use_rocm or args.use_ipex or args.use_openvino) allow_openvino = not (args.use_cuda or args.use_rocm or args.use_ipex or args.use_directml) - log.debug(f'Torch overrides: cuda={args.use_cuda} rocm={args.use_rocm} ipex={args.use_ipex} diml={args.use_directml} openvino={args.use_openvino} zluda={args.use_zluda}') + log.debug(f'Torch overrides: cuda={args.use_cuda} rocm={args.use_rocm} ipex={args.use_ipex} directml={args.use_directml} openvino={args.use_openvino} zluda={args.use_zluda}') # log.debug(f'Torch allowed: cuda={allow_cuda} rocm={allow_rocm} ipex={allow_ipex} diml={allow_directml} openvino={allow_openvino}') torch_command = os.environ.get('TORCH_COMMAND', '') @@ -1038,6 +1029,8 @@ def set_environment(): os.environ.setdefault('UVICORN_TIMEOUT_KEEP_ALIVE', '60') os.environ.setdefault('KINETO_LOG_LEVEL', '3') os.environ.setdefault('DO_NOT_TRACK', '1') + os.environ.setdefault('UV_INDEX_STRATEGY', 'unsafe-any-match') + os.environ.setdefault('UV_NO_BUILD_ISOLATION', '1') os.environ.setdefault('HF_HUB_CACHE', opts.get('hfcache_dir', os.path.join(os.path.expanduser('~'), '.cache', 'huggingface', 'hub'))) allocator = f'garbage_collection_threshold:{opts.get("torch_gc_threshold", 80)/100:0.2f},max_split_size_mb:512' if opts.get("torch_malloc", "native") == 'cudaMallocAsync': diff --git a/javascript/sdnext.css b/javascript/sdnext.css index 
0f3765694..3c81e7d8e 100644 --- a/javascript/sdnext.css +++ b/javascript/sdnext.css @@ -55,6 +55,8 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; } .gradio-radio { padding: 0 !important; width: max-content !important; } .gradio-slider { margin-right: var(--spacing-sm) !important; width: max-content !important } .gradio-slider input[type="number"] { width: 5em; font-size: var(--text-xs); height: 16px; text-align: right; padding: 0; } +.gradio-checkboxgroup { padding: 0 !important; } +.gradio-checkbox > label { color: var(--block-title-text-color) !important; } /* custom gradio elements */ .accordion-compact { padding: 8px 0px 4px 0px !important; } diff --git a/modules/apg/__init__.py b/modules/apg/__init__.py index 0bf6f1c8d..1dc7a619e 100644 --- a/modules/apg/__init__.py +++ b/modules/apg/__init__.py @@ -27,12 +27,15 @@ def project( v0: torch.Tensor, # [B, C, H, W] v1: torch.Tensor, # [B, C, H, W] ): + device = v0.device dtype = v0.dtype + if device.type == "xpu": + v0, v1 = v0.to("cpu"), v1.to("cpu") v0, v1 = v0.double(), v1.double() v1 = torch.nn.functional.normalize(v1, dim=[-1, -2, -3]) v0_parallel = (v0 * v1).sum(dim=[-1, -2, -3], keepdim=True) * v1 v0_orthogonal = v0 - v0_parallel - return v0_parallel.to(dtype), v0_orthogonal.to(dtype) + return v0_parallel.to(device, dtype=dtype), v0_orthogonal.to(device, dtype=dtype) def normalized_guidance( diff --git a/modules/devices.py b/modules/devices.py index f69d19f53..17e6c8f0f 100644 --- a/modules/devices.py +++ b/modules/devices.py @@ -291,6 +291,7 @@ def set_cudnn_params(): torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True + torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True) except Exception: pass if torch.backends.cudnn.is_available(): diff --git a/modules/intel/ipex/__init__.py b/modules/intel/ipex/__init__.py index b84a853ed..e1c476e7e 100644 --- a/modules/intel/ipex/__init__.py +++ b/modules/intel/ipex/__init__.py @@ -16,6 +16,13 @@ def ipex_init(): # pylint: disable=too-many-statements if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_xpu_hijacked") and torch.cuda.is_xpu_hijacked: return True, "Skipping IPEX hijack" else: + try: # force xpu device on torch compile and triton + torch._inductor.utils.GPU_TYPES = ["xpu"] + torch._inductor.utils.get_gpu_type = lambda *args, **kwargs: "xpu" + from triton import backends as triton_backends # pylint: disable=import-error + triton_backends.backends["nvidia"].driver.is_active = lambda *args, **kwargs: False + except Exception: + pass # Replace cuda with xpu: torch.cuda.current_device = torch.xpu.current_device torch.cuda.current_stream = torch.xpu.current_stream @@ -115,26 +122,26 @@ def ipex_init(): # pylint: disable=too-many-statements torch.cuda.traceback = torch.xpu.traceback # Memory: - if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): + if legacy and 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read(): torch.xpu.empty_cache = lambda: None torch.cuda.empty_cache = torch.xpu.empty_cache if legacy: - torch.cuda.memory = torch.xpu.memory - torch.cuda.memory_stats = torch.xpu.memory_stats torch.cuda.memory_summary = torch.xpu.memory_summary torch.cuda.memory_snapshot = torch.xpu.memory_snapshot - torch.cuda.memory_allocated = torch.xpu.memory_allocated - torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated - torch.cuda.memory_reserved = torch.xpu.memory_reserved - 
torch.cuda.memory_cached = torch.xpu.memory_reserved - torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved - torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved - torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats - torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats - torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict - torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats + torch.cuda.memory = torch.xpu.memory + torch.cuda.memory_stats = torch.xpu.memory_stats + torch.cuda.memory_allocated = torch.xpu.memory_allocated + torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated + torch.cuda.memory_reserved = torch.xpu.memory_reserved + torch.cuda.memory_cached = torch.xpu.memory_reserved + torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved + torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved + torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats + torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats + torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict + torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats # RNG: torch.cuda.get_rng_state = torch.xpu.get_rng_state @@ -183,7 +190,8 @@ def ipex_init(): # pylint: disable=too-many-statements torch._C._XpuDeviceProperties.minor = 1 # Fix functions with ipex: - torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] + torch.xpu.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory] + torch.cuda.mem_get_info = torch.xpu.mem_get_info torch._utils._get_available_device_type = lambda: "xpu" torch.has_cuda = True torch.cuda.has_half = True @@ -192,13 +200,13 @@ def ipex_init(): # pylint: disable=too-many-statements torch.backends.cuda.is_built = lambda *args, **kwargs: True torch.version.cuda = "12.1" torch.cuda.get_arch_list = lambda: ["ats-m150", "pvc"] - torch.cuda.get_device_capability = lambda *args, **kwargs: [12,1] + torch.cuda.get_device_capability = lambda *args, **kwargs: (12,1) torch.cuda.get_device_properties.major = 12 torch.cuda.get_device_properties.minor = 1 torch.cuda.ipc_collect = lambda *args, **kwargs: None torch.cuda.utilization = lambda *args, **kwargs: 0 - ipex_hijacks() + ipex_hijacks(legacy=legacy) try: from .diffusers import ipex_diffusers ipex_diffusers() diff --git a/modules/intel/ipex/hijacks.py b/modules/intel/ipex/hijacks.py index bb79053c4..7ec94138d 100644 --- a/modules/intel/ipex/hijacks.py +++ b/modules/intel/ipex/hijacks.py @@ -293,7 +293,9 @@ def torch_load(f, map_location=None, *args, **kwargs): # Hijack Functions: -def ipex_hijacks(): +def ipex_hijacks(legacy=True): + if legacy: + torch.nn.functional.interpolate = interpolate torch.tensor = torch_tensor torch.Tensor.to = Tensor_to torch.Tensor.cuda = Tensor_cuda @@ -319,7 +321,6 @@ def ipex_hijacks(): torch.nn.functional.layer_norm = functional_layer_norm torch.nn.functional.linear = functional_linear torch.nn.functional.conv2d = functional_conv2d - torch.nn.functional.interpolate = interpolate 
torch.nn.functional.pad = functional_pad torch.bmm = torch_bmm diff --git a/modules/interrogate.py b/modules/interrogate.py index 68c8aca00..5ae06fb90 100644 --- a/modules/interrogate.py +++ b/modules/interrogate.py @@ -252,6 +252,8 @@ def get_clip_models(): def load_interrogator(clip_model, blip_model): + from installer import install + install('clip_interrogator==0.6.0') import clip_interrogator clip_interrogator.CAPTION_MODELS = caption_models global ci # pylint: disable=global-statement diff --git a/modules/loader.py b/modules/loader.py index 18cb42893..a2970abfd 100644 --- a/modules/loader.py +++ b/modules/loader.py @@ -1,5 +1,6 @@ from __future__ import annotations from functools import partial +import os import re import sys import logging @@ -13,7 +14,11 @@ logging.getLogger("DeepSpeed").disabled = True +os.environ.setdefault('TORCH_LOGS', '-all') import torch # pylint: disable=C0411 +if torch.__version__.startswith('2.5.0'): + errors.log.warning(f'Disabling cuDNN for SDP on torch={torch.__version__}') + torch.backends.cuda.enable_cudnn_sdp(False) try: import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import errors.log.debug(f'Load IPEX=={ipex.__version__}') @@ -96,7 +101,6 @@ def get_packages(): } try: - import os import math cores = os.cpu_count() affinity = len(os.sched_getaffinity(0)) diff --git a/modules/memstats.py b/modules/memstats.py index eab4a677a..c417165a2 100644 --- a/modules/memstats.py +++ b/modules/memstats.py @@ -27,12 +27,12 @@ def gb(val: float): s = torch.cuda.mem_get_info() gpu = { 'used': gb(s[1] - s[0]), 'total': gb(s[1]) } s = dict(torch.cuda.memory_stats()) - if s['num_ooms'] > 0: + if s.get('num_ooms', 0) > 0: shared.state.oom = True mem.update({ 'gpu': gpu, - 'retries': s['num_alloc_retries'], - 'oom': s['num_ooms'] + 'retries': s.get('num_alloc_retries', 0), + 'oom': s.get('num_ooms', 0) }) return mem except Exception: diff --git a/modules/model_stablecascade.py b/modules/model_stablecascade.py index 44cea1818..444d11adc 100644 --- a/modules/model_stablecascade.py +++ b/modules/model_stablecascade.py @@ -6,7 +6,7 @@ def get_timestep_ratio_conditioning(t, alphas_cumprod): - s = torch.tensor([0.008]) # diffusers uses 0.003 while the original is 0.008 + s = torch.tensor([0.008]) clamp_range = [0, 1] min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2 var = alphas_cumprod[t] @@ -133,8 +133,6 @@ def load_cascade_combined(checkpoint_info, diffusers_load_config): sd_model = StableCascadeCombinedPipeline.from_pretrained(checkpoint_info.path, cache_dir=shared.opts.diffusers_dir, **diffusers_load_config) sd_model.prior_pipe.scheduler.config.clip_sample = False - sd_model.default_scheduler = copy.deepcopy(sd_model.prior_pipe.scheduler) - sd_model.prior_pipe.get_timestep_ratio_conditioning = get_timestep_ratio_conditioning sd_model.decoder_pipe.text_encoder = sd_model.text_encoder = None # Nothing uses the decoder's text encoder sd_model.prior_pipe.image_encoder = sd_model.prior_image_encoder = None # No img2img is implemented yet sd_model.prior_pipe.feature_extractor = sd_model.prior_feature_extractor = None # No img2img is implemented yet diff --git a/modules/processing.py b/modules/processing.py index 471053f50..04350ee39 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -254,6 +254,7 @@ def process_init(p: StableDiffusionProcessing): p.all_prompts, p.all_negative_prompts = shared.prompt_styles.apply_styles_to_prompts(p.all_prompts, p.all_negative_prompts, p.styles, p.all_seeds) p.prompts = 
p.all_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] p.negative_prompts = p.all_negative_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] + p.prompts, _ = extra_networks.parse_prompts(p.prompts) def process_images_inner(p: StableDiffusionProcessing) -> Processed: diff --git a/modules/processing_diffusers.py b/modules/processing_diffusers.py index cd0763eb9..9b485d361 100644 --- a/modules/processing_diffusers.py +++ b/modules/processing_diffusers.py @@ -404,8 +404,10 @@ def process_diffusers(p: processing.StableDiffusionProcessing): shared.sd_model = sd_models.set_diffuser_pipe(shared.sd_model, sd_models.DiffusersTaskType.INPAINTING) # force pipeline if len(getattr(p, 'init_images', [])) == 0: p.init_images = [TF.to_pil_image(torch.rand((3, getattr(p, 'height', 512), getattr(p, 'width', 512))))] - p.prompts = p.all_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] - p.negative_prompts = p.all_negative_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] + if p.prompts is None or len(p.prompts) == 0: + p.prompts = p.all_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] + if p.negative_prompts is None or len(p.negative_prompts) == 0: + p.negative_prompts = p.all_negative_prompts[p.iteration * p.batch_size:(p.iteration+1) * p.batch_size] sd_models.move_model(shared.sd_model, devices.device) sd_models_compile.openvino_recompile_model(p, hires=False, refiner=False) # recompile if a parameter changes diff --git a/modules/sd_hijack.py b/modules/sd_hijack.py index 8ddd1f8ec..75e51f4da 100644 --- a/modules/sd_hijack.py +++ b/modules/sd_hijack.py @@ -20,7 +20,6 @@ import modules.textual_inversion.textual_inversion from modules import devices, sd_hijack_optimizations -from modules import sd_hijack_clip, sd_hijack_open_clip, sd_hijack_unet, sd_hijack_xlmr, xlmr from modules.hypernetworks import hypernetwork attention_CrossAttention_forward = ldm.modules.attention.CrossAttention.forward @@ -40,6 +39,7 @@ def apply_optimizations(): undo_optimizations() + from modules import sd_hijack_unet ldm.modules.diffusionmodules.model.nonlinearity = silu ldm.modules.diffusionmodules.openaimodel.th = sd_hijack_unet.th optimization_method = None @@ -159,6 +159,7 @@ def __init__(self): self.embedding_db.add_embedding_dir(shared.opts.embeddings_dir) def hijack(self, m): + from modules import sd_hijack_clip, sd_hijack_open_clip, sd_hijack_unet, sd_hijack_xlmr, xlmr if type(m.cond_stage_model) == xlmr.BertSeriesModelWithTransformation: model_embeddings = m.cond_stage_model.roberta.embeddings model_embeddings.token_embedding = EmbeddingsWithFixes(model_embeddings.word_embeddings, self) @@ -223,6 +224,7 @@ def flatten(el): self.layers = flatten(m) def undo_hijack(self, m): + from modules import sd_hijack_clip, sd_hijack_open_clip, xlmr if not hasattr(m, 'cond_stage_model'): return # not ldm model if type(m.cond_stage_model) == xlmr.BertSeriesModelWithTransformation: diff --git a/modules/sd_models.py b/modules/sd_models.py index ef3960fcf..47d0c5947 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -737,8 +737,8 @@ def eval_model(model, op=None, sd_model=None): # pylint: disable=unused-argument model.eval() return model sd_model = sd_models_compile.apply_compile_to_model(sd_model, eval_model, ["Model", "VAE", "Text Encoder"], op="eval") - if shared.opts.diffusers_quantization: - sd_model = sd_models_compile.dynamic_quantization(sd_model) + if len(shared.opts.torchao_quantization) > 0: + sd_model = 
sd_models_compile.torchao_quantization(sd_model) if shared.opts.opt_channelslast and hasattr(sd_model, 'unet'): shared.log.debug(f'Setting {op}: channels-last=True') @@ -1193,7 +1193,7 @@ def load_diffuser_file(model_type, pipeline, checkpoint_info, diffusers_load_con from diffusers.utils import import_utils import_utils._accelerate_available = False # pylint: disable=protected-access if shared.opts.diffusers_to_gpu and model_type.startswith('Stable Diffusion'): - shared.log.debug(f'Diffusers accelerate: hijack={shared.opts.diffusers_to_gpu}') + shared.log.debug(f'Diffusers accelerate: direct={shared.opts.diffusers_to_gpu}') sd_hijack_accelerate.hijack_accelerate() else: sd_hijack_accelerate.restore_accelerate() @@ -1298,7 +1298,10 @@ def load_diffuser(checkpoint_info=None, already_loaded_state_dict=None, timer=No sd_model.sd_model_hash = checkpoint_info.calculate_shorthash() # pylint: disable=attribute-defined-outside-init sd_model.sd_checkpoint_info = checkpoint_info # pylint: disable=attribute-defined-outside-init sd_model.sd_model_checkpoint = checkpoint_info.filename # pylint: disable=attribute-defined-outside-init - sd_model.default_scheduler = copy.deepcopy(sd_model.scheduler) if hasattr(sd_model, "scheduler") else None + if hasattr(sd_model, "prior_pipe"): + sd_model.default_scheduler = copy.deepcopy(sd_model.prior_pipe.scheduler) if hasattr(sd_model.prior_pipe, "scheduler") else None + else: + sd_model.default_scheduler = copy.deepcopy(sd_model.scheduler) if hasattr(sd_model, "scheduler") else None sd_model.is_sdxl = False # a1111 compatibility item sd_model.is_sd2 = hasattr(sd_model, 'cond_stage_model') and hasattr(sd_model.cond_stage_model, 'model') # a1111 compatibility item sd_model.is_sd1 = not sd_model.is_sd2 # a1111 compatibility item diff --git a/modules/sd_models_compile.py b/modules/sd_models_compile.py index ff4fa6aff..91ed84ded 100644 --- a/modules/sd_models_compile.py +++ b/modules/sd_models_compile.py @@ -183,7 +183,7 @@ def nncf_compress_model(model, op=None, sd_model=None): def nncf_compress_weights(sd_model): try: t0 = time.time() - shared.log.info(f"NNCF Compress Weights: {shared.opts.nncf_compress_weights}") + shared.log.info(f"Quantization: type=NNCF modules={shared.opts.nncf_compress_weights}") global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement install('nncf==2.7.0', quiet=True) @@ -199,9 +199,9 @@ def nncf_compress_weights(sd_model): quant_last_model_device = None t1 = time.time() - shared.log.info(f"NNCF Compress Weights: time={t1-t0:.2f}") + shared.log.info(f"Quantization: type=NNCF time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"NNCF Compress Weights: error: {e}") + shared.log.warning(f"Quantization: type=NNCF {e}") return sd_model @@ -249,10 +249,10 @@ def optimum_quanto_model(model, op=None, sd_model=None, weights=None, activation def optimum_quanto_weights(sd_model): try: if shared.opts.diffusers_offload_mode in {"balanced", "sequential"}: - shared.log.warning(f"Optimum Quanto Weights is incompatible with {shared.opts.diffusers_offload_mode} offload!") + shared.log.warning(f"Quantization: type=Optimum.quanto offload={shared.opts.diffusers_offload_mode} not compatible") return sd_model t0 = time.time() - shared.log.info(f"Optimum Quanto Weights: {shared.opts.optimum_quanto_weights}") + shared.log.info(f"Quantization: type=Optimum.quanto: modules={shared.opts.optimum_quanto_weights}") global quant_last_model_name, quant_last_model_device # pylint: disable=global-statement quanto = 
model_quant.load_quanto() quanto.tensor.qbits.QBitsTensor.create = lambda *args, **kwargs: quanto.tensor.qbits.QBitsTensor(*args, **kwargs) @@ -299,9 +299,9 @@ def encode_prompt(*args, **kwargs): devices.torch_gc(force=True) t1 = time.time() - shared.log.info(f"Optimum Quanto Weights: time={t1-t0:.2f}") + shared.log.info(f"Quantization: type=Optimum.quanto time={t1-t0:.2f}") except Exception as e: - shared.log.warning(f"Optimum Quanto Weights: error: {e}") + shared.log.warning(f"Quantization: type=Optimum.quanto {e}") return sd_model @@ -329,7 +329,7 @@ def compile_onediff(sd_model): from onediff.infer_compiler import oneflow_compile except Exception as e: - shared.log.warning(f"Model compile using onediff/oneflow: {e}") + shared.log.warning(f"Model compile: task=onediff {e}") return sd_model try: @@ -351,9 +351,9 @@ def compile_onediff(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: task=onediff/oneflow time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=onediff time={t1-t0:.2f}") except Exception as e: - shared.log.info(f"Model compile: task=onediff/oneflow error: {e}") + shared.log.info(f"Model compile: task=onediff {e}") return sd_model @@ -361,7 +361,7 @@ def compile_stablefast(sd_model): try: import sfast.compilers.stable_diffusion_pipeline_compiler as sf except Exception as e: - shared.log.warning(f'Model compile using stable-fast: {e}') + shared.log.warning(f'Model compile: task=stablefast: {e}') return sd_model config = sf.CompilationConfig.Default() try: @@ -390,9 +390,9 @@ def compile_stablefast(sd_model): if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: task='Stable-fast' config={config.__dict__} time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=stablefast config={config.__dict__} time={t1-t0:.2f}") except Exception as e: - shared.log.info(f"Model compile: task=Stable-fast error: {e}") + shared.log.info(f"Model compile: task=stablefast {e}") return sd_model @@ -401,7 +401,7 @@ def compile_torch(sd_model): t0 = time.time() import torch._dynamo # pylint: disable=unused-import,redefined-outer-name torch._dynamo.reset() # pylint: disable=protected-access - shared.log.debug(f"Model compile available backends: {torch._dynamo.list_backends()}") # pylint: disable=protected-access + shared.log.debug(f"Model compile: task=torch backends={torch._dynamo.list_backends()}") # pylint: disable=protected-access def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused-argument if hasattr(model, "device") and model.device.type != "meta": @@ -442,7 +442,7 @@ def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused torch._inductor.config.use_mixed_mm = True # pylint: disable=protected-access # torch._inductor.config.force_fuse_int_mm_with_mul = True # pylint: disable=protected-access except Exception as e: - shared.log.error(f"Torch inductor config error: {e}") + shared.log.error(f"Model compile: torch inductor config error: {e}") sd_model = apply_compile_to_model(sd_model, function=torch_compile_model, options=shared.opts.cuda_compile, op="compile") @@ -450,9 +450,9 @@ def torch_compile_model(model, op=None, sd_model=None): # pylint: disable=unused if shared.opts.cuda_compile_precompile: sd_model("dummy prompt") t1 = time.time() - shared.log.info(f"Model compile: time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=torch time={t1-t0:.2f}") except Exception as e: - 
shared.log.warning(f"Model compile error: {e}") + shared.log.warning(f"Model compile: task=torch {e}") return sd_model @@ -467,19 +467,19 @@ def check_deepcache(enable: bool): def compile_deepcache(sd_model): global deepcache_worker # pylint: disable=global-statement if not hasattr(sd_model, 'unet'): - shared.log.warning(f'Model compile using deep-cache: {sd_model.__class__} not supported') + shared.log.warning(f'Model compile: task=deepcache pipeline={sd_model.__class__} not supported') return sd_model try: from DeepCache import DeepCacheSDHelper except Exception as e: - shared.log.warning(f'Model compile using deep-cache: {e}') + shared.log.warning(f'Model compile: task=deepcache {e}') return sd_model t0 = time.time() check_deepcache(False) deepcache_worker = DeepCacheSDHelper(pipe=sd_model) deepcache_worker.set_params(cache_interval=shared.opts.deep_cache_interval, cache_branch_id=0) t1 = time.time() - shared.log.info(f"Model compile: task='DeepCache' config={deepcache_worker.params} time={t1-t0:.2f}") + shared.log.info(f"Model compile: task=deepcache config={deepcache_worker.params} time={t1-t0:.2f}") # config={'cache_interval': 3, 'cache_layer_id': 0, 'cache_block_id': 0, 'skip_mode': 'uniform'} time=0.00 return sd_model @@ -503,40 +503,56 @@ def compile_diffusers(sd_model): return sd_model -def dynamic_quantization(sd_model): +def torchao_quantization(sd_model): try: install('torchao', quiet=True) - from torchao.quantization import autoquant + from torchao import quantization as q except Exception as e: - shared.log.error(f"Model dynamic quantization not supported: {e}") + shared.log.error(f"Quantization: type=TorchAO quantization not supported: {e}") return sd_model - - """ - from torchao.quantization import quant_api - def dynamic_quant_filter_fn(mod, *args): # pylint: disable=unused-argument - return (isinstance(mod, torch.nn.Linear) and mod.in_features > 16 and (mod.in_features, mod.out_features) - not in [(1280, 640), (1920, 1280), (1920, 640), (2048, 1280), (2048, 2560), (2560, 1280), (256, 128), (2816, 1280), (320, 640), (512, 1536), (512, 256), (512, 512), (640, 1280), (640, 1920), (640, 320), (640, 5120), (640, 640), (960, 320), (960, 640)]) - - def conv_filter_fn(mod, *args): # pylint: disable=unused-argument - return (isinstance(mod, torch.nn.Conv2d) and mod.kernel_size == (1, 1) and 128 in [mod.in_channels, mod.out_channels]) - - quant_api.swap_conv2d_1x1_to_linear(sd_model.unet, conv_filter_fn) - quant_api.swap_conv2d_1x1_to_linear(sd_model.vae, conv_filter_fn) - quant_api.apply_dynamic_quant(sd_model.unet, dynamic_quant_filter_fn) - quant_api.apply_dynamic_quant(sd_model.vae, dynamic_quant_filter_fn) - """ - - shared.log.info(f"Model dynamic quantization: pipeline={sd_model.__class__.__name__}") + if shared.opts.torchao_quantization_type == "int8+act": + fn = q.int8_dynamic_activation_int8_weight + elif shared.opts.torchao_quantization_type == "int8": + fn = q.int8_weight_only + elif shared.opts.torchao_quantization_type == "int4": + fn = q.int4_weight_only + elif shared.opts.torchao_quantization_type == "fp8+act": + fn = q.float8_dynamic_activation_float8_weight + elif shared.opts.torchao_quantization_type == "fp8": + fn = q.float8_weight_only + elif shared.opts.torchao_quantization_type == "fpx": + fn = q.fpx_weight_only + else: + shared.log.error(f"Quantization: type=TorchAO type={shared.opts.torchao_quantization_type} not supported") + return sd_model + shared.log.info(f"Quantization: type=TorchAO pipe={sd_model.__class__.__name__} 
quant={shared.opts.torchao_quantization_type} fn={fn} targets={shared.opts.torchao_quantization}") try: - if shared.sd_model_type == 'sd' or shared.sd_model_type == 'sdxl': - sd_model.unet = sd_model.unet.to(devices.device) - sd_model.unet = autoquant(sd_model.unet, error_on_unseen=False) - elif shared.sd_model_type == 'f1': - sd_model.transformer = autoquant(sd_model.transformer, error_on_unseen=False) - else: - shared.log.error(f"Model dynamic quantization not supported: {shared.sd_model_type}") + t0 = time.time() + modules = [] + if hasattr(sd_model, 'unet') and 'Model' in shared.opts.torchao_quantization: + modules.append('unet') + q.quantize_(sd_model.unet, fn(), device=devices.device) + if hasattr(sd_model, 'transformer') and 'Model' in shared.opts.torchao_quantization: + modules.append('transformer') + q.quantize_(sd_model.transformer, fn(), device=devices.device) + # sd_model.transformer = q.autoquant(sd_model.transformer, error_on_unseen=False) + if hasattr(sd_model, 'vae') and 'VAE' in shared.opts.torchao_quantization: + modules.append('vae') + q.quantize_(sd_model.vae, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te1') + q.quantize_(sd_model.text_encoder, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder_2') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te2') + q.quantize_(sd_model.text_encoder_2, fn(), device=devices.device) + if hasattr(sd_model, 'text_encoder_3') and 'Text Encoder' in shared.opts.torchao_quantization: + modules.append('te3') + q.quantize_(sd_model.text_encoder_3, fn(), device=devices.device) + t1 = time.time() + shared.log.info(f"Quantization: type=TorchAO modules={modules} time={t1-t0:.2f}") except Exception as e: - shared.log.error(f"Model dynamic quantization: {e}") + shared.log.error(f"Quantization: type=TorchAO {e}") + setup_logging() # torchao uses dynamo which messes with logging so reset is needed return sd_model diff --git a/modules/shared.py b/modules/shared.py index 6f7c14d75..69574ff92 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -492,11 +492,12 @@ def get_default_modes(): "quant_sep": OptionInfo("
Model Quantization
", "", gr.HTML, {"visible": native}), "quant_shuffle_weights": OptionInfo(False, "Shuffle the weights between GPU and CPU when quantizing", gr.Checkbox, {"visible": native}), - "diffusers_quantization": OptionInfo(False, "Dynamic quantization with TorchAO", gr.Checkbox, {"visible": native}), - "nncf_compress_weights": OptionInfo([], "Compress Model weights with NNCF INT8", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), - "optimum_quanto_weights": OptionInfo([], "Quantize Model weights with Optimum Quanto", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), - "optimum_quanto_weights_type": OptionInfo("qint8", "Weights type for Optimum Quanto", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}), - "optimum_quanto_activations_type": OptionInfo("none", "Activations type for Optimum Quanto", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}), + "nncf_compress_weights": OptionInfo([], "NNCF int8 compression enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), + "optimum_quanto_weights": OptionInfo([], "Optimum.quanto quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "ControlNet"], "visible": native}), + "optimum_quanto_weights_type": OptionInfo("qint8", "Optimum.quanto quantization type", gr.Radio, {"choices": ['qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2', 'qint4', 'qint2'], "visible": native}), + "optimum_quanto_activations_type": OptionInfo("none", "Optimum.quanto quantization activations ", gr.Radio, {"choices": ['none', 'qint8', 'qfloat8_e4m3fn', 'qfloat8_e5m2'], "visible": native}), + "torchao_quantization": OptionInfo([], "TorchAO quantization enabled", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": native}), + "torchao_quantization_type": OptionInfo("int8", "TorchAO quantization type", gr.Radio, {"choices": ["int8+act", "int8", "int4", "fp8+act", "fp8", "fpx"], "visible": native}), "ipex_sep": OptionInfo("
IPEX
", "", gr.HTML, {"visible": devices.backend == "ipex"}), "ipex_optimize": OptionInfo([], "IPEX Optimize for Intel GPUs", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder", "Upscaler"], "visible": devices.backend == "ipex"}), @@ -712,6 +713,7 @@ def get_default_modes(): "theme_type": OptionInfo("Standard", "Theme type", gr.Radio, {"choices": ["Modern", "Standard", "None"]}), "theme_style": OptionInfo("Auto", "Theme mode", gr.Radio, {"choices": ["Auto", "Dark", "Light"]}), "gradio_theme": OptionInfo("black-teal", "UI theme", gr.Dropdown, lambda: {"choices": theme.list_themes()}, refresh=theme.refresh_themes), + "autolaunch": OptionInfo(False, "Autolaunch browser upon startup"), "font_size": OptionInfo(14, "Font size", gr.Slider, {"minimum": 8, "maximum": 32, "step": 1, "visible": True}), "tooltips": OptionInfo("UI Tooltips", "UI tooltips", gr.Radio, {"choices": ["None", "Browser default", "UI tooltips"], "visible": False}), "aspect_ratios": OptionInfo("1:1, 4:3, 3:2, 16:9, 16:10, 21:9, 2:3, 3:4, 9:16, 10:16, 9:21", "Allowed aspect ratios"), diff --git a/requirements.txt b/requirements.txt index 4999331ca..374451a57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,6 @@ patch-ng anyio addict astunparse -clean-fid filetype future GitPython @@ -12,7 +11,6 @@ inflection jsonmerge kornia lark -lpips omegaconf optimum piexif @@ -35,8 +33,6 @@ peft==0.13.1 httpx==0.24.1 compel==2.0.3 torchsde==0.2.6 -open-clip-torch -clip-interrogator==0.6.0 antlr4-python3-runtime==4.9.3 requests==2.32.3 tqdm==4.66.5 @@ -65,3 +61,7 @@ torchdiffeq dctorch scikit-image seam-carving +open-clip-torch + +# TODO temporary block for torch==2.5.0 +torchvision!=0.20.0 diff --git a/webui.py b/webui.py index f0f29e13a..32e7ed75b 100644 --- a/webui.py +++ b/webui.py @@ -350,7 +350,7 @@ def webui(restart=False): continue logger.handlers = log.handlers # autolaunch only on initial start - if cmd_opts.autolaunch and local_url is not None: + if (shared.opts.autolaunch or cmd_opts.autolaunch) and local_url is not None: cmd_opts.autolaunch = False shared.log.info('Launching browser') import webbrowser
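A closing note that ties the `requirements.txt` and `installer.py` changes together: the new `torchvision!=0.20.0` line is an exclusion pin, and the updated `installed()` check now treats any `!=` specifier as satisfied locally, so the exclusion only takes effect when pip actually resolves the requirements. Below is a simplified, hypothetical illustration of that check, not the real installer function, and with a deliberately naive version comparison.

```python
# Hypothetical, simplified sketch of the specifier handling added to installer.installed().
# Exclusion pins such as "torchvision!=0.20.0" are never compared against the local
# version; pip enforces them when the package is (re)installed.
def spec_satisfied(pkg: str, installed_version: str) -> bool:
    if '!=' in pkg:
        return True  # "not equal" pins always pass the local check
    if '>=' in pkg:
        _name, wanted = pkg.split('>=')
        return installed_version >= wanted  # naive lexicographic compare, illustration only
    if '==' in pkg:
        _name, wanted = pkg.split('==')
        return installed_version.startswith(wanted)
    return True  # bare package name: presence alone is enough

print(spec_satisfied('torchvision!=0.20.0', '0.19.1'))  # True
print(spec_satisfied('torch==2.4.1', '2.4.1+cu124'))    # True
```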