Commit 4e06aad

SDXL fixes

monorimet committed Dec 4, 2023
1 parent 9bfa20b commit 4e06aad

Showing 13 changed files with 428 additions and 168 deletions.
2 changes: 0 additions & 2 deletions apps/stable_diffusion/src/models/model_wrappers.py
@@ -1100,15 +1100,13 @@ def __init__(
                 model_id,
                 subfolder="text_encoder",
                 low_cpu_mem_usage=low_cpu_mem_usage,
-                variant="fp16",
             )
         else:
             self.text_encoder = (
                 CLIPTextModelWithProjection.from_pretrained(
                     model_id,
                     subfolder="text_encoder_2",
                     low_cpu_mem_usage=low_cpu_mem_usage,
-                    variant="fp16",
                 )
             )

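For context on the two deletions above: the variant keyword asks from_pretrained to fetch weight files carrying a variant suffix (e.g. model.fp16.safetensors), and dropping it falls back to the repository's default weights. A minimal sketch of the call as it stands after this change -- the model ID is illustrative, not taken from this diff:

from transformers import CLIPTextModelWithProjection

# Without variant="fp16", from_pretrained resolves the default weight files
# rather than the .fp16-suffixed ones. The model ID below is illustrative.
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="text_encoder_2",
    low_cpu_mem_usage=True,
)
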
@@ -18,7 +18,10 @@
     KDPM2AncestralDiscreteScheduler,
     HeunDiscreteScheduler,
 )
-from apps.stable_diffusion.src.schedulers import SharkEulerDiscreteScheduler
+from apps.stable_diffusion.src.schedulers import (
+    SharkEulerDiscreteScheduler,
+    SharkEulerAncestralDiscreteScheduler,
+)
 from apps.stable_diffusion.src.pipelines.pipeline_shark_stable_diffusion_utils import (
     StableDiffusionPipeline,
 )
@@ -74,11 +74,11 @@ def __init__(
         self.unet = None
         self.unet_512 = None
         self.model_max_length = 77
+        self.scheduler = scheduler
         # TODO: Implement using the Python logging utility.
         self.log = ""
         self.status = SD_STATE_IDLE
         self.sd_model = sd_model
-        self.scheduler = scheduler
         self.import_mlir = import_mlir
         self.use_lora = use_lora
         self.ondemand = ondemand
@@ -529,6 +529,9 @@ def produce_img_latents_sdxl(
         cpu_scheduling,
         guidance_scale,
         dtype,
+        mask=None,
+        masked_image_latents=None,
+        return_all_latents=False,
     ):
         # return None
         self.status = SD_STATE_IDLE
@@ -539,11 +542,22 @@
             step_start_time = time.time()
             timestep = torch.tensor([t]).to(dtype).detach().numpy()
             # expand the latents if we are doing classifier free guidance
+            if isinstance(latents, np.ndarray):
+                latents = torch.tensor(latents)
             latent_model_input = torch.cat([latents] * 2)

             latent_model_input = self.scheduler.scale_model_input(
                 latent_model_input, t
-            ).to(dtype)
+            )
+            if mask is not None and masked_image_latents is not None:
+                latent_model_input = torch.cat(
+                    [
+                        torch.from_numpy(np.asarray(latent_model_input)),
+                        mask,
+                        masked_image_latents,
+                    ],
+                    dim=1,
+                ).to(dtype)

             noise_pred = self.unet(
                 "forward",
@@ -555,11 +569,17 @@
                     add_time_ids,
                     guidance_scale,
                 ),
-                send_to_host=False,
+                send_to_host=True,
             )
+            if not isinstance(latents, torch.Tensor):
+                latents = torch.from_numpy(latents).to("cpu")
+            noise_pred = torch.from_numpy(noise_pred).to("cpu")
+
             latents = self.scheduler.step(
                 noise_pred, t, latents, **extra_step_kwargs, return_dict=False
             )[0]
+            latents = latents.detach().numpy()
+            noise_pred = noise_pred.detach().numpy()

             step_time = (time.time() - step_start_time) * 1000
             step_time_sum += step_time
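For context, the new mask path follows the standard Stable Diffusion inpainting convention: the inpainting UNet consumes nine input channels per sample -- four noisy latent channels, one downscaled mask channel, and four channels of the VAE-encoded masked image. A minimal sketch with illustrative shapes (batch 2 from classifier-free guidance, 128x128 SDXL latents):

import torch

# 4 latent + 1 mask + 4 masked-image channels = 9 UNet input channels.
latents = torch.randn(2, 4, 128, 128)
mask = torch.randn(2, 1, 128, 128)
masked_image_latents = torch.randn(2, 4, 128, 128)

unet_input = torch.cat([latents, mask, masked_image_latents], dim=1)
assert unet_input.shape == (2, 9, 128, 128)
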
2 changes: 2 additions & 0 deletions apps/stable_diffusion/src/schedulers/__init__.py
@@ -1,5 +1,7 @@
 from apps.stable_diffusion.src.schedulers.sd_schedulers import get_schedulers
 from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
     SharkEulerDiscreteScheduler,
 )
+from apps.stable_diffusion.src.schedulers.shark_eulerancestraldiscrete import (
+    SharkEulerAncestralDiscreteScheduler,
+)
16 changes: 14 additions & 2 deletions apps/stable_diffusion/src/schedulers/sd_schedulers.py
@@ -14,11 +14,23 @@
 )
 from apps.stable_diffusion.src.schedulers.shark_eulerdiscrete import (
     SharkEulerDiscreteScheduler,
 )
+from apps.stable_diffusion.src.schedulers.shark_eulerancestraldiscrete import (
+    SharkEulerAncestralDiscreteScheduler,
+)


 def get_schedulers(model_id):
+    # TODO: Robust scheduler setup on pipeline creation -- if we don't
+    # set batch_size here, the SHARK schedulers will compile with
+    # batch size = 1 regardless of whether the model outputs latents
+    # of a larger batch size, e.g. SDXL. This also goes toward
+    # enabling batch size configuration for SD in general. However,
+    # searching the base model ID for "xl" is obviously not very
+    # robust.
+
+    batch_size = 2 if "xl" in model_id.lower() else 1
+
     schedulers = dict()
     schedulers["PNDM"] = PNDMScheduler.from_pretrained(
         model_id,
@@ -107,6 +119,6 @@ def get_schedulers(model_id):
         model_id,
         subfolder="scheduler",
     )
-    schedulers["SharkEulerDiscrete"].compile()
-    schedulers["SharkEulerAncestralDiscrete"].compile()
+    schedulers["SharkEulerDiscrete"].compile(batch_size)
+    schedulers["SharkEulerAncestralDiscrete"].compile(batch_size)
     return schedulers
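
For context on batch_size: with classifier-free guidance the pipeline concatenates the unconditional and conditional latents before every scheduler and UNet call (see produce_img_latents_sdxl above), so a scheduler compiled with static shapes must be built for twice the user-facing batch size. A minimal sketch with illustrative shapes:

import torch

# One image at SDXL's native 1024x1024 resolution -> 4x128x128 latents.
latents = torch.randn(1, 4, 128, 128)

# Classifier-free guidance doubles the batch (unconditional + conditional),
# so a statically compiled scheduler module must be built for batch size 2.
latent_model_input = torch.cat([latents] * 2)
assert latent_model_input.shape == (2, 4, 128, 128)
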
249 changes: 249 additions & 0 deletions apps/stable_diffusion/src/schedulers/shark_eulerancestraldiscrete.py
@@ -0,0 +1,249 @@
import sys
import numpy as np
from typing import List, Optional, Tuple, Union
from diffusers import (
EulerAncestralDiscreteScheduler,
)
from diffusers.utils.torch_utils import randn_tensor
from diffusers.configuration_utils import register_to_config
from apps.stable_diffusion.src.utils import (
compile_through_fx,
get_shark_model,
args,
)
import torch


class SharkEulerAncestralDiscreteScheduler(EulerAncestralDiscreteScheduler):
@register_to_config
def __init__(
self,
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: str = "epsilon",
timestep_spacing: str = "linspace",
steps_offset: int = 0,
):
super().__init__(
num_train_timesteps,
beta_start,
beta_end,
beta_schedule,
trained_betas,
prediction_type,
timestep_spacing,
steps_offset,
)
        # TODO: Make this dynamic so we don't have to worry about batch size.
self.batch_size = None
self.init_input_shape = None

def compile(self, batch_size=1):
SCHEDULER_BUCKET = "gs://shark_tank/stable_diffusion/schedulers"
device = args.device.split(":", 1)[0].strip()
self.batch_size = batch_size

model_input = {
"eulera": {
"output": torch.randn(
batch_size, 4, args.height // 8, args.width // 8
),
"latent": torch.randn(
batch_size, 4, args.height // 8, args.width // 8
),
"sigma": torch.tensor(1).to(torch.float32),
"sigma_from": torch.tensor(1).to(torch.float32),
"sigma_to": torch.tensor(1).to(torch.float32),
"noise": torch.randn(
batch_size, 4, args.height // 8, args.width // 8
),
},
}

example_latent = model_input["eulera"]["latent"]
example_output = model_input["eulera"]["output"]
example_noise = model_input["eulera"]["noise"]
if args.precision == "fp16":
example_latent = example_latent.half()
example_output = example_output.half()
example_noise = example_noise.half()
example_sigma = model_input["eulera"]["sigma"]
example_sigma_from = model_input["eulera"]["sigma_from"]
example_sigma_to = model_input["eulera"]["sigma_to"]

class ScalingModel(torch.nn.Module):
def __init__(self):
super().__init__()

def forward(self, latent, sigma):
return latent / ((sigma**2 + 1) ** 0.5)

class SchedulerStepEpsilonModel(torch.nn.Module):
def __init__(self):
super().__init__()

def forward(
self, noise_pred, latent, sigma, sigma_from, sigma_to, noise
):
sigma_up = (
sigma_to**2
* (sigma_from**2 - sigma_to**2)
/ sigma_from**2
) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
dt = sigma_down - sigma
pred_original_sample = latent - sigma * noise_pred
derivative = (latent - pred_original_sample) / sigma
prev_sample = latent + derivative * dt
return prev_sample + noise * sigma_up

class SchedulerStepVPredictionModel(torch.nn.Module):
def __init__(self):
super().__init__()

def forward(
                self, noise_pred, latent, sigma, sigma_from, sigma_to, noise
):
sigma_up = (
sigma_to**2
* (sigma_from**2 - sigma_to**2)
/ sigma_from**2
) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
dt = sigma_down - sigma
pred_original_sample = noise_pred * (
-sigma / (sigma**2 + 1) ** 0.5
) + (latent / (sigma**2 + 1))
derivative = (latent - pred_original_sample) / sigma
prev_sample = latent + derivative * dt
return prev_sample + noise * sigma_up

iree_flags = []
if len(args.iree_vulkan_target_triple) > 0:
iree_flags.append(
f"-iree-vulkan-target-triple={args.iree_vulkan_target_triple}"
)

def _import(self):
scaling_model = ScalingModel()
self.scaling_model, _ = compile_through_fx(
model=scaling_model,
inputs=(example_latent, example_sigma),
extended_model_name=f"euler_a_scale_model_input_{self.batch_size}_{args.height}_{args.width}_{device}_"
+ args.precision,
extra_args=iree_flags,
)

pred_type_model_dict = {
"epsilon": SchedulerStepEpsilonModel(),
"v_prediction": SchedulerStepVPredictionModel(),
}
step_model = pred_type_model_dict[self.config.prediction_type]
self.step_model, _ = compile_through_fx(
step_model,
(
example_output,
example_latent,
example_sigma,
example_sigma_from,
example_sigma_to,
example_noise,
),
extended_model_name=f"euler_a_step_{self.config.prediction_type}_{self.batch_size}_{args.height}_{args.width}_{device}_"
+ args.precision,
extra_args=iree_flags,
)

if args.import_mlir:
_import(self)

else:
try:
self.scaling_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_a_scale_model_input_" + args.precision,
iree_flags,
)
self.step_model = get_shark_model(
SCHEDULER_BUCKET,
"euler_a_step_" + step_model_type + args.precision,
iree_flags,
)
            except Exception:
print(
"failed to download model, falling back and using import_mlir"
)
args.import_mlir = True
_import(self)

def scale_model_input(self, sample, timestep):
if self.step_index is None:
self._init_step_index(timestep)
sigma = self.sigmas[self.step_index]
return self.scaling_model(
"forward",
(
sample,
sigma,
),
send_to_host=False,
)

def step(
self,
noise_pred,
timestep,
latent,
generator: Optional[torch.Generator] = None,
return_dict: Optional[bool] = False,
):
step_inputs = []

if self.step_index is None:
self._init_step_index(timestep)

sigma = self.sigmas[self.step_index]

sigma_from = self.sigmas[self.step_index]
sigma_to = self.sigmas[self.step_index + 1]
noise = randn_tensor(
torch.Size(noise_pred.shape),
dtype=torch.float16,
device="cpu",
generator=generator,
)
self._step_index += 1
step_inputs = [
noise_pred,
latent,
sigma,
sigma_from,
sigma_to,
noise,
]
        # The step index was already advanced above, since we are done with it
        # and will return the compiled module's output directly.
        # TODO: This might not be the proper behavior here; handle dynamic
        # input shapes instead of duplicating inputs to pad the batch below.
if noise_pred.shape[0] < self.batch_size:
for i in [0, 1, 5]:
try:
step_inputs[i] = torch.tensor(step_inputs[i])
                except Exception:
step_inputs[i] = torch.tensor(step_inputs[i].to_host())
step_inputs[i] = torch.cat(
(step_inputs[i], step_inputs[i]), axis=0
)
return self.step_model(
"forward",
tuple(step_inputs),
send_to_host=True,
)

return self.step_model(
"forward",
tuple(step_inputs),
send_to_host=False,
)
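
For reference, SchedulerStepEpsilonModel above compiles the standard Euler ancestral update, matching diffusers' EulerAncestralDiscreteScheduler. In the code's notation, with latent $x$, model output $\epsilon$, and fresh noise $z \sim \mathcal{N}(0, I)$:

$$\sigma_{\text{up}} = \sqrt{\sigma_{\text{to}}^{2} \left( \sigma_{\text{from}}^{2} - \sigma_{\text{to}}^{2} \right) / \sigma_{\text{from}}^{2}}, \qquad \sigma_{\text{down}} = \sqrt{\sigma_{\text{to}}^{2} - \sigma_{\text{up}}^{2}}$$

$$\hat{x}_{0} = x - \sigma \epsilon, \qquad x_{\text{prev}} = x + \frac{x - \hat{x}_{0}}{\sigma} \left( \sigma_{\text{down}} - \sigma \right) + \sigma_{\text{up}} z$$

SchedulerStepVPredictionModel differs only in the estimate of $\hat{x}_{0}$, using $\hat{x}_{0} = -\frac{\sigma}{\sqrt{\sigma^{2} + 1}} \epsilon + \frac{x}{\sigma^{2} + 1}$.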