feat(diffusers/pipelines): update pipelines to v0.29
townwish4git committed Sep 2, 2024
1 parent 46a4ed6 commit 68e0b70
Showing 62 changed files with 10,283 additions and 722 deletions.
29 changes: 25 additions & 4 deletions mindone/diffusers/pipelines/__init__.py
@@ -6,6 +6,7 @@
_import_structure = {
"animatediff": [
"AnimateDiffPipeline",
"AnimateDiffSDXLPipeline",
"AnimateDiffVideoToVideoPipeline",
],
"blip_diffusion": ["BlipDiffusionPipeline"],
@@ -19,6 +20,13 @@
"StableDiffusionXLControlNetInpaintPipeline",
"StableDiffusionXLControlNetPipeline",
],
"controlnet_xs": [
"StableDiffusionControlNetXSPipeline",
"StableDiffusionXLControlNetXSPipeline",
],
"controlnet_sd3": [
"StableDiffusion3ControlNetPipeline",
],
"dance_diffusion": ["DanceDiffusionPipeline"],
"ddim": ["DDIMPipeline"],
"ddpm": ["DDPMPipeline"],
@@ -31,6 +39,7 @@
"IFSuperResolutionPipeline",
],
"dit": ["DiTPipeline"],
"hunyuandit": ["HunyuanDiTPipeline"],
"i2vgen_xl": ["I2VGenXLPipeline"],
"latent_diffusion": ["LDMSuperResolutionPipeline", "LDMTextToImagePipeline"],
"kandinsky": [
@@ -62,7 +71,14 @@
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline",
],
"pixart_alpha": ["PixArtAlphaPipeline"],
"marigold": [
"MarigoldDepthPipeline",
"MarigoldNormalsPipeline",
],
"pixart_alpha": [
"PixArtAlphaPipeline",
"PixArtSigmaPipeline",
],
"shap_e": ["ShapEImg2ImgPipeline", "ShapEPipeline"],
"stable_cascade": [
"StableCascadeCombinedPipeline",
@@ -82,6 +98,7 @@
],
"stable_diffusion_3": [
"StableDiffusion3Pipeline",
"StableDiffusion3Img2ImgPipeline",
],
"stable_diffusion_gligen": [
"StableDiffusionGLIGENPipeline",
@@ -112,7 +129,7 @@
}

if TYPE_CHECKING:
from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline
from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline
from .blip_diffusion import BlipDiffusionPipeline
from .consistency_models import ConsistencyModelPipeline
from .controlnet import (
@@ -124,6 +141,8 @@
StableDiffusionXLControlNetInpaintPipeline,
StableDiffusionXLControlNetPipeline,
)
from .controlnet_sd3 import StableDiffusion3ControlNetPipeline
from .controlnet_xs import StableDiffusionControlNetXSPipeline, StableDiffusionXLControlNetXSPipeline
from .ddim import DDIMPipeline
from .ddpm import DDPMPipeline
from .deepfloyd_if import (
@@ -135,6 +154,7 @@
IFSuperResolutionPipeline,
)
from .dit import DiTPipeline
from .hunyuandit import HunyuanDiTPipeline
from .i2vgen_xl import I2VGenXLPipeline
from .kandinsky import (
KandinskyCombinedPipeline,
@@ -160,8 +180,9 @@
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .latent_diffusion import LDMSuperResolutionPipeline, LDMTextToImagePipeline
from .marigold import MarigoldDepthPipeline, MarigoldNormalsPipeline
from .pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .pixart_alpha import PixArtAlphaPipeline
from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline, StableCascadePriorPipeline
from .stable_diffusion import (
@@ -175,7 +196,7 @@
StableDiffusionPipeline,
StableDiffusionUpscalePipeline,
)
from .stable_diffusion_3 import StableDiffusion3Pipeline
from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
from .stable_diffusion_xl import (
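The registrations above follow the diffusers lazy-import convention: each new pipeline is listed once in _import_structure (resolved lazily at runtime) and once in the TYPE_CHECKING block (imported eagerly only for static analysis). Below is a minimal sketch of that pattern, reduced to a single entry; the _LazyModule tail is the standard diffusers boilerplate and is assumed unchanged by this commit, since it is not shown in the truncated diff:

# Sketch of the pipelines/__init__.py lazy-import pattern (illustrative, not part of the diff)
import sys
from typing import TYPE_CHECKING

from ..utils import _LazyModule  # diffusers-style lazy-module helper

_import_structure = {
    "hunyuandit": ["HunyuanDiTPipeline"],  # submodule name -> exported pipeline classes
}

if TYPE_CHECKING:
    # eager imports, visible only to type checkers and IDEs
    from .hunyuandit import HunyuanDiTPipeline
else:
    # at runtime the module object is replaced by a proxy that imports each
    # submodule the first time one of its attributes is accessed
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)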
2 changes: 2 additions & 0 deletions mindone/diffusers/pipelines/animatediff/__init__.py
@@ -7,11 +7,13 @@


_import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
_import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
_import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]


if TYPE_CHECKING:
from .pipeline_animatediff import AnimateDiffPipeline
from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
from .pipeline_output import AnimateDiffPipelineOutput
else:
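For context, here is a hedged usage sketch of the newly registered AnimateDiffSDXLPipeline, mirroring the upstream diffusers example. The checkpoint ids, the mindspore_dtype keyword, and the tuple-style output handling are assumptions drawn from the usual mindone conventions, not from this diff:

import mindspore as ms

from mindone.diffusers import DDIMScheduler
from mindone.diffusers.models.unets.unet_motion_model import MotionAdapter
from mindone.diffusers.pipelines import AnimateDiffSDXLPipeline

# SDXL motion-adapter weights (checkpoint id assumed for illustration)
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", mindspore_dtype=ms.float16)

pipe = AnimateDiffSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # base SDXL weights (id assumed)
    motion_adapter=adapter,
    mindspore_dtype=ms.float16,
)
pipe.scheduler = DDIMScheduler.from_config(
    pipe.scheduler.config, clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1
)

output = pipe(prompt="a panda surfing a wave, highly detailed", num_frames=16, num_inference_steps=25)
# mindone pipelines typically return a tuple unless return_dict=True is passed
frames = output[0][0] if isinstance(output, tuple) else output.frames[0]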
49 changes: 15 additions & 34 deletions mindone/diffusers/pipelines/animatediff/pipeline_animatediff.py
@@ -22,7 +22,7 @@
from mindspore import ops

from ....transformers import CLIPTextModel, CLIPVisionModelWithProjection
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...image_processor import PipelineImageInput
from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
from ...models.unets.unet_motion_model import MotionAdapter
@@ -36,6 +36,7 @@
)
from ...utils import deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.mindspore_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import AnimateDiffPipelineOutput

@@ -81,27 +82,6 @@
"""


def tensor2vid(video: ms.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
batch_size, channels, num_frames, height, width = video.shape
outputs = []
for batch_idx in range(batch_size):
batch_vid = video[batch_idx].permute(1, 0, 2, 3)
batch_output = processor.postprocess(batch_vid, output_type)

outputs.append(batch_output)

if output_type == "np":
outputs = np.stack(outputs)

elif output_type == "pt":
outputs = ops.stack(outputs)

elif not output_type == "pil":
raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pil']")

return outputs


class AnimateDiffPipeline(
DiffusionPipeline,
TextualInversionLoaderMixin,
@@ -145,7 +125,7 @@ def __init__(
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
unet: Union[UNet2DConditionModel, UNetMotionModel],
motion_adapter: MotionAdapter,
scheduler: Union[
DDIMScheduler,
@@ -173,7 +153,7 @@ def __init__(
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt # noqa: E501
def encode_prompt(
@@ -341,9 +321,10 @@ def encode_prompt(
negative_prompt_embeds = negative_prompt_embeds.tile((1, num_images_per_prompt, 1))
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

if isinstance(self, LoraLoaderMixin):
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(self.text_encoder, lora_scale)
if self.text_encoder is not None:
if isinstance(self, LoraLoaderMixin):
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(self.text_encoder, lora_scale)

return prompt_embeds, negative_prompt_embeds

@@ -636,11 +617,11 @@ def __call__(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[ms.Tensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
if `do_classifier_free_guidance` is set to `True`.
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated video. Choose between `ms.Tensor`, `PIL.Image` or
`np.array`.
@@ -782,7 +763,7 @@ def __call__(
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

# 8. Denoising loop
with self.progress_bar(total=num_inference_steps) as progress_bar:
with self.progress_bar(total=self._num_timesteps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
latent_model_input = ops.cat([latents] * 2) if self.do_classifier_free_guidance else latents
@@ -834,7 +815,7 @@ def __call__(
video = latents
else:
video_tensor = self.decode_latents(latents)
video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

if not return_dict:
return (video,)
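The remaining hunks swap the module-level tensor2vid helper (deleted above) for the shared VideoProcessor added in v0.29. Below is a call-site sketch of the migration; the replacement lines are taken from this diff, and the assumption is that postprocess_video performs the same per-batch permute/postprocess/stack that the removed helper did:

# v0.28-style: per-file helper driven by the VaeImageProcessor
video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)

# v0.29-style: a VideoProcessor is built once in __init__ ...
self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
# ... and reused wherever decoded latents need to become frames
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)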
(The remaining changed files are not expanded in this view.)
