Merge branch 'dev' into lora-extract
AI-Casanova authored Oct 20, 2024
2 parents 2744269 + 13a2e6c commit 3ed48ec
Showing 20 changed files with 266 additions and 119 deletions.
26 changes: 23 additions & 3 deletions CHANGELOG.md
@@ -1,15 +1,20 @@
# Change Log for SD.Next

## Update for 2024-10-17
## Update for 2024-10-18

### Highlights for 2024-10-17
### Highlights for 2024-10-18

Workflow highlights:

- **Reprocess**: new workflow options that let you generate at lower quality and then
reprocess only selected images at higher quality, or generate without hires/refine and then reprocess with hires/refine,
and you can pick any previous latent from the auto-captured history!
- **Detailer**: fully built-in detailer workflow with support for all standard models
- Built-in **model analyzer**
See all details of your currently loaded model, including components, parameter count, layer count, etc.

Newly supported:

- New fine-tuned [CLIP-ViT-L](https://huggingface.co/zer0int/CLIP-GmP-ViT-L-14) 1st-stage **text-encoders**, as used by SD15, SDXL, Flux.1, etc., bring additional detail to your images
- New models:
- [CogView 3 Plus](https://huggingface.co/THUDM/CogView3-Plus-3B)
@@ -18,10 +23,15 @@
[Ctrl+X](https://github.com/genforce/ctrl-x) which allows for control of **structure and appearance** without the need for extra models,
[APG: Adaptive Projected Guidance](https://arxiv.org/pdf/2410.02416) for optimal **guidance** control,
[LinFusion](https://github.com/Huage001/LinFusion) for on-the-fly distillation of any sd15/sdxl model

Otherwise notable:

- Several [Flux.1](https://huggingface.co/black-forest-labs/FLUX.1-dev) optimizations and new quantization types
- Auto-detection of the best available **device/dtype** settings for your platform and GPU reduces the need for manual configuration
- Full rewrite of **sampler options**, now far more streamlined, with tons of new options to tweak scheduler behavior
- Improved **LoRA** detection and handling for all supported models
- Tons of work on dynamic quantization that can be applied on-the-fly to any model type during model load
Supported quantization engines include TorchAO, Optimum.quanto, NNCF compression, and more...

Oh, and we've compiled a full table listing popular text-to-image generative models, their respective parameters, and an architecture overview: <https://github.com/vladmandic/automatic/wiki/Models>

@@ -30,7 +40,7 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
[README](https://github.com/vladmandic/automatic/blob/master/README.md) | [CHANGELOG](https://github.com/vladmandic/automatic/blob/master/CHANGELOG.md) | [WiKi](https://github.com/vladmandic/automatic/wiki) | [Discord](https://discord.com/invite/sd-next-federal-batch-inspectors-1101998836328697867)


### Details for 2024-10-17
### Details for 2024-10-18

- **reprocess**
- new top-level button: reprocess latent from your history of generated image(s)
@@ -211,6 +221,11 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- setting `lora_load_gpu` to load LoRA directly to GPU
*default*: true unless lowvram

- **torchao**
- reimplement torchao quantization
- configure in settings -> compute settings -> quantization
- can be applied to any model on-the-fly during load
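A minimal sketch of what on-the-fly TorchAO weight quantization can look like when applied to an already-loaded model (the model id and the int8 weight-only config below are illustrative choices, not SD.Next's exact internals):

```python
# illustrative sketch only: quantize an already-loaded UNet in-place with TorchAO
import torch
import diffusers
from torchao.quantization import quantize_, int8_weight_only

pipe = diffusers.StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # example model id
    torch_dtype=torch.bfloat16,
)
quantize_(pipe.unet, int8_weight_only())  # quantizes weights in-place, no separate checkpoint needed
pipe = pipe.to("cuda")
```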

- **huggingface**:
- force logout/login on token change
- unified handling of cache folder: set via `HF_HUB` or `HF_HUB_CACHE` or via settings -> system paths
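For example, the cache location can also be pinned via the environment before launch (the path below is just an example):

```python
# example: point the huggingface hub cache at a custom location before starting SD.Next
import os
os.environ.setdefault('HF_HUB_CACHE', '/mnt/models/huggingface')
```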
@@ -219,6 +234,9 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- add support for *image2video* (in addition to previous *text2video* and *video2video*)
- *note*: *image2video* requires separate 5b model variant
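A hedged sketch of the *image2video* path using the diffusers pipeline for the 5b I2V variant (model id, frame count, and fps are assumptions, not SD.Next's exact call):

```python
# illustrative sketch: CogVideoX image2video via diffusers, using the separate 5b I2V variant
import torch
import diffusers
from diffusers.utils import load_image, export_to_video

pipe = diffusers.CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",  # image2video needs the I2V model, not the base text2video one
    torch_dtype=torch.bfloat16,
)
pipe.enable_model_cpu_offload()

image = load_image("input.png")
frames = pipe(prompt="a gentle camera pan", image=image, num_frames=49).frames[0]
export_to_video(frames, "output.mp4", fps=8)
```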

- **torch**
- due to numerous issues with torch 2.5.0, which was just released as stable, we are sticking with 2.4.1 for now

- **backend=original** is now marked as in maintenance-only mode
- **python 3.12** improved compatibility, automatically handle `setuptools`
- **control**
@@ -233,10 +251,12 @@ And there are also other goodies like multiple *XYZ grid* improvements, addition
- fix update infotext on image select
- fix imageviewer exif parser
- selectable info view in image viewer, thanks @ZeldaMaster501
- setting to enable browser autolaunch, thanks @brknsoul
- **free-u** check if device/dtype are FFT-compatible and cast as necessary
- **rocm**
- additional gpu detection and auto-config code, thanks @lshqqytiger
- experimental triton backend for flash attention, thanks @lshqqytiger
- update to rocm 6.2, thanks @Disty0
- **directml**
- update `torch` to 2.4.1, thanks @lshqqytiger
- **extensions**
89 changes: 89 additions & 0 deletions cli/load_unet.py
@@ -0,0 +1,89 @@
import torch
import diffusers


class StateDictStats():
cls: str = None
device: torch.device = None
params: int = 0
weights: dict = {}
dtypes: dict = {}
config: dict = None

def __repr__(self):
return f'cls={self.cls} params={self.params} weights={self.weights} device={self.device} dtypes={self.dtypes} config={self.config is not None}'


def set_module_tensor(
module: torch.nn.Module,
name: str,
value: torch.Tensor,
stats: StateDictStats,
device: torch.device = None,
dtype: torch.dtype = None,
):
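# walk dotted parameter names down to the owning submodule, then assign the tensor as a buffer or parameter on the requested device/dtype while counting dtypes and weight kinds in stats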
if "." in name:
splits = name.split(".")
for split in splits[:-1]:
module = getattr(module, split)
name = splits[-1]
old_value = getattr(module, name)
with torch.no_grad():
if value.dtype not in stats.dtypes:
stats.dtypes[value.dtype] = 0
stats.dtypes[value.dtype] += 1
if name in module._buffers: # pylint: disable=protected-access
module._buffers[name] = value.to(device=device, dtype=dtype, non_blocking=True) # pylint: disable=protected-access
if 'buffers' not in stats.weights:
stats.weights['buffers'] = 0
stats.weights['buffers'] += 1
elif value is not None:
param_cls = type(module._parameters[name]) # pylint: disable=protected-access
module._parameters[name] = param_cls(value, requires_grad=old_value.requires_grad).to(device, dtype=dtype, non_blocking=True) # pylint: disable=protected-access
if 'parameters' not in stats.weights:
stats.weights['parameters'] = 0
stats.weights['parameters'] += 1


def load_unet(config_file: str, state_dict: dict, device: torch.device = None, dtype: torch.dtype = None):
# same can be done for other modules or even for entire model by loading model config and then walking through its modules
from accelerate import init_empty_weights
with init_empty_weights():
stats = StateDictStats()
stats.device = device
stats.config = diffusers.UNet2DConditionModel.load_config(config_file)
unet = diffusers.UNet2DConditionModel.from_config(stats.config)
stats.cls = unet.__class__.__name__
expected_state_dict_keys = list(unet.state_dict().keys())
stats.weights['expected'] = len(expected_state_dict_keys)
for param_name, param in state_dict.items():
if param_name not in expected_state_dict_keys:
if 'unknown' not in stats.weights:
stats.weights['unknown'] = 0
stats.weights['unknown'] += 1
continue
set_module_tensor(unet, name=param_name, value=param, device=device, dtype=dtype, stats=stats)
state_dict[param_name] = None # unload as we initialize the model so we dont consume double the memory
stats.params = sum(p.numel() for p in unet.parameters(recurse=True))
return unet, stats


def load_safetensors(fn: str):
import safetensors.torch
state_dict = safetensors.torch.load_file(fn, device='cpu') # state dict should always be loaded to cpu
return state_dict


if __name__ == "__main__":
# need pipe already present to load unet state_dict into or we could load unet first and then manually create pipe with params
pipe = diffusers.StableDiffusionXLPipeline.from_single_file('/mnt/models/stable-diffusion/sdxl/TempestV0.1-Artistic.safetensors', cache_dir='/mnt/models/huggingface')
# this could be kept in memory so we dont have to reload it
dct = load_safetensors('/mnt/models/UNET/dpo-sdxl-text2image.safetensors')
pipe.unet, s = load_unet(
config_file = 'configs/sdxl/unet/config.json', # can also point to online hf model with subfolder
state_dict = dct,
device = torch.device('cpu'), # can leave out to use default device
dtype = torch.bfloat16, # can leave out to use default dtype, especially for mixed precision modules
)
from rich import print as rprint
rprint(f'Stats: {s}')
41 changes: 17 additions & 24 deletions installer.py
@@ -212,7 +212,10 @@ def installed(package, friendly: str = None, reload = False, quiet = False):
pkgs = [p for p in package.split() if not p.startswith('-') and not p.startswith('=')]
pkgs = [p.split('/')[-1] for p in pkgs] # get only package name if installing from url
for pkg in pkgs:
if '>=' in pkg:
if '!=' in pkg:
p = pkg.split('!=')
return True # check for not equal always return true
elif '>=' in pkg:
p = pkg.split('>=')
else:
p = pkg.split('==')
@@ -485,7 +488,8 @@ def check_torchao():
def install_cuda():
log.info('CUDA: nVidia toolkit detected')
install('onnxruntime-gpu', 'onnxruntime-gpu', ignore=True, quiet=True)
return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124')
# return os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/cu124')
return os.environ.get('TORCH_COMMAND', 'torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cu124')


def install_rocm_zluda():
@@ -566,8 +570,11 @@ def install_rocm_zluda():
log.info('Using CPU-only torch')
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision')
else:
if rocm.version is None or float(rocm.version) > 6.1: # assume the latest if version check fails
torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1')
if rocm.version is None or float(rocm.version) >= 6.1: # assume the latest if version check fails
#torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/rocm6.1')
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.1 torchvision==0.19.1+rocm6.1 --index-url https://download.pytorch.org/whl/rocm6.1')
elif rocm.version == "6.0": # lock to 2.4.1, older rocm (5.7) uses torch 2.3
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.4.1+rocm6.0 torchvision==0.19.1+rocm6.0 --index-url https://download.pytorch.org/whl/rocm6.0')
elif float(rocm.version) < 5.5: # oldest supported version is 5.5
log.warning(f"ROCm: unsupported version={rocm.version}")
log.warning("ROCm: minimum supported version=5.5")
@@ -583,7 +590,7 @@ def install_rocm_zluda():
ort_package = os.environ.get('ONNXRUNTIME_PACKAGE', f"--pre onnxruntime-training{'' if ort_version is None else ('==' + ort_version)} --index-url https://pypi.lsh.sh/{rocm.version[0]}{rocm.version[2]} --extra-index-url https://pypi.org/simple")
install(ort_package, 'onnxruntime-training')

if device is not None:
if installed("torch") and device is not None:
if 'Flash attention' in opts.get('sdp_options'):
if not installed('flash-attn'):
install(rocm.get_flash_attention_command(device), reinstall=True)
@@ -616,26 +623,10 @@ def install_ipex(torch_command):
os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
if "linux" in sys.platform:
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.3.1+cxx11.abi torchvision==0.18.1+cxx11.abi intel-extension-for-pytorch==2.3.110+xpu oneccl_bind_pt==2.3.100+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/')
# torch_command = os.environ.get('TORCH_COMMAND', 'torch torchvision --index-url https://download.pytorch.org/whl/test/xpu') # test wheels are stable previews, significantly slower than IPEX
# os.environ.setdefault('TENSORFLOW_PACKAGE', 'tensorflow==2.15.1 intel-extension-for-tensorflow[xpu]==2.15.0.1')
else:
if sys.version_info.minor == 11:
pytorch_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp311-cp311-win_amd64.whl'
torchvision_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torchvision-0.16.0a0+cxx11.abi-cp311-cp311-win_amd64.whl'
ipex_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp311-cp311-win_amd64.whl'
torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}')
elif sys.version_info.minor == 10:
pytorch_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torch-2.1.0a0+cxx11.abi-cp310-cp310-win_amd64.whl'
torchvision_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/torchvision-0.16.0a0+cxx11.abi-cp310-cp310-win_amd64.whl'
ipex_pip = 'https://github.com/Nuullll/intel-extension-for-pytorch/releases/download/v2.1.10%2Bxpu/intel_extension_for_pytorch-2.1.10+xpu-cp310-cp310-win_amd64.whl'
torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}')
else:
torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.1.0.post3 torchvision==0.16.0.post3 intel-extension-for-pytorch==2.1.40+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/')
if os.environ.get('DISABLE_VENV_LIBS', None) is None:
install(os.environ.get('MKL_PACKAGE', 'mkl==2024.2.0'), 'mkl')
install(os.environ.get('DPCPP_PACKAGE', 'mkl-dpcpp==2024.2.0'), 'mkl-dpcpp')
install(os.environ.get('ONECCL_PACKAGE', 'oneccl-devel==2021.13.0'), 'oneccl-devel')
install(os.environ.get('MPI_PACKAGE', 'impi-devel==2021.13.0'), 'impi-devel')
torch_command = os.environ.get('TORCH_COMMAND', f'{pytorch_pip} {torchvision_pip} {ipex_pip}')
torch_command = os.environ.get('TORCH_COMMAND', '--pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/xpu') # torchvision doesn't exist on test/stable branch for windows
install(os.environ.get('OPENVINO_PACKAGE', 'openvino==2024.3.0'), 'openvino', ignore=True)
install('nncf==2.7.0', 'nncf', ignore=True)
install(os.environ.get('ONNXRUNTIME_PACKAGE', 'onnxruntime-openvino'), 'onnxruntime-openvino', ignore=True)
@@ -697,7 +688,7 @@ def check_torch():
allow_ipex = not (args.use_cuda or args.use_rocm or args.use_directml or args.use_openvino)
allow_directml = not (args.use_cuda or args.use_rocm or args.use_ipex or args.use_openvino)
allow_openvino = not (args.use_cuda or args.use_rocm or args.use_ipex or args.use_directml)
log.debug(f'Torch overrides: cuda={args.use_cuda} rocm={args.use_rocm} ipex={args.use_ipex} diml={args.use_directml} openvino={args.use_openvino} zluda={args.use_zluda}')
log.debug(f'Torch overrides: cuda={args.use_cuda} rocm={args.use_rocm} ipex={args.use_ipex} directml={args.use_directml} openvino={args.use_openvino} zluda={args.use_zluda}')
# log.debug(f'Torch allowed: cuda={allow_cuda} rocm={allow_rocm} ipex={allow_ipex} diml={allow_directml} openvino={allow_openvino}')
torch_command = os.environ.get('TORCH_COMMAND', '')

@@ -1038,6 +1029,8 @@ def set_environment():
os.environ.setdefault('UVICORN_TIMEOUT_KEEP_ALIVE', '60')
os.environ.setdefault('KINETO_LOG_LEVEL', '3')
os.environ.setdefault('DO_NOT_TRACK', '1')
os.environ.setdefault('UV_INDEX_STRATEGY', 'unsafe-any-match')
os.environ.setdefault('UV_NO_BUILD_ISOLATION', '1')
os.environ.setdefault('HF_HUB_CACHE', opts.get('hfcache_dir', os.path.join(os.path.expanduser('~'), '.cache', 'huggingface', 'hub')))
allocator = f'garbage_collection_threshold:{opts.get("torch_gc_threshold", 80)/100:0.2f},max_split_size_mb:512'
if opts.get("torch_malloc", "native") == 'cudaMallocAsync':
2 changes: 2 additions & 0 deletions javascript/sdnext.css
@@ -55,6 +55,8 @@ td > div > span { overflow-y: auto; max-height: 3em; overflow-x: hidden; }
.gradio-radio { padding: 0 !important; width: max-content !important; }
.gradio-slider { margin-right: var(--spacing-sm) !important; width: max-content !important }
.gradio-slider input[type="number"] { width: 5em; font-size: var(--text-xs); height: 16px; text-align: right; padding: 0; }
.gradio-checkboxgroup { padding: 0 !important; }
.gradio-checkbox > label { color: var(--block-title-text-color) !important; }

/* custom gradio elements */
.accordion-compact { padding: 8px 0px 4px 0px !important; }
5 changes: 4 additions & 1 deletion modules/apg/__init__.py
@@ -27,12 +27,15 @@ def project(
v0: torch.Tensor, # [B, C, H, W]
v1: torch.Tensor, # [B, C, H, W]
):
device = v0.device
dtype = v0.dtype
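# the projection below runs in float64, which is generally unavailable on XPU devices, hence the temporary CPU fallback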
if device.type == "xpu":
v0, v1 = v0.to("cpu"), v1.to("cpu")
v0, v1 = v0.double(), v1.double()
v1 = torch.nn.functional.normalize(v1, dim=[-1, -2, -3])
v0_parallel = (v0 * v1).sum(dim=[-1, -2, -3], keepdim=True) * v1
v0_orthogonal = v0 - v0_parallel
return v0_parallel.to(dtype), v0_orthogonal.to(dtype)
return v0_parallel.to(device, dtype=dtype), v0_orthogonal.to(device, dtype=dtype)


def normalized_guidance(
1 change: 1 addition & 0 deletions modules/devices.py
@@ -291,6 +291,7 @@ def set_cudnn_params():
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
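# this setter only exists in newer torch builds; the surrounding try/except turns it into a no-op elsewhere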
except Exception:
pass
if torch.backends.cudnn.is_available():