feat(diffusers/pipelines): update pipelines to v0.29
townwish4git committed Sep 2, 2024
1 parent 46a4ed6 commit 68e0b70
Showing 62 changed files with 10,283 additions and 722 deletions.
29 changes: 25 additions & 4 deletions mindone/diffusers/pipelines/__init__.py
@@ -6,6 +6,7 @@
_import_structure = {
"animatediff": [
"AnimateDiffPipeline",
"AnimateDiffSDXLPipeline",
"AnimateDiffVideoToVideoPipeline",
],
"blip_diffusion": ["BlipDiffusionPipeline"],
@@ -19,6 +20,13 @@
"StableDiffusionXLControlNetInpaintPipeline",
"StableDiffusionXLControlNetPipeline",
],
"controlnet_xs": [
"StableDiffusionControlNetXSPipeline",
"StableDiffusionXLControlNetXSPipeline",
],
"controlnet_sd3": [
"StableDiffusion3ControlNetPipeline",
],
"dance_diffusion": ["DanceDiffusionPipeline"],
"ddim": ["DDIMPipeline"],
"ddpm": ["DDPMPipeline"],
@@ -31,6 +39,7 @@
"IFSuperResolutionPipeline",
],
"dit": ["DiTPipeline"],
"hunyuandit": ["HunyuanDiTPipeline"],
"i2vgen_xl": ["I2VGenXLPipeline"],
"latent_diffusion": ["LDMSuperResolutionPipeline", "LDMTextToImagePipeline"],
"kandinsky": [
@@ -62,7 +71,14 @@
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline",
],
"pixart_alpha": ["PixArtAlphaPipeline"],
"marigold": [
"MarigoldDepthPipeline",
"MarigoldNormalsPipeline",
],
"pixart_alpha": [
"PixArtAlphaPipeline",
"PixArtSigmaPipeline",
],
"shap_e": ["ShapEImg2ImgPipeline", "ShapEPipeline"],
"stable_cascade": [
"StableCascadeCombinedPipeline",
@@ -82,6 +98,7 @@
],
"stable_diffusion_3": [
"StableDiffusion3Pipeline",
"StableDiffusion3Img2ImgPipeline",
],
"stable_diffusion_gligen": [
"StableDiffusionGLIGENPipeline",
@@ -112,7 +129,7 @@
}

if TYPE_CHECKING:
from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline
from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline
from .blip_diffusion import BlipDiffusionPipeline
from .consistency_models import ConsistencyModelPipeline
from .controlnet import (
@@ -124,6 +141,8 @@
StableDiffusionXLControlNetInpaintPipeline,
StableDiffusionXLControlNetPipeline,
)
from .controlnet_sd3 import StableDiffusion3ControlNetPipeline
from .controlnet_xs import StableDiffusionControlNetXSPipeline, StableDiffusionXLControlNetXSPipeline
from .ddim import DDIMPipeline
from .ddpm import DDPMPipeline
from .deepfloyd_if import (
@@ -135,6 +154,7 @@
IFSuperResolutionPipeline,
)
from .dit import DiTPipeline
from .hunyuandit import HunyuanDiTPipeline
from .i2vgen_xl import I2VGenXLPipeline
from .kandinsky import (
KandinskyCombinedPipeline,
@@ -160,8 +180,9 @@
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .latent_diffusion import LDMSuperResolutionPipeline, LDMTextToImagePipeline
from .marigold import MarigoldDepthPipeline, MarigoldNormalsPipeline
from .pipeline_utils import DiffusionPipeline, ImagePipelineOutput
from .pixart_alpha import PixArtAlphaPipeline
from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline, StableCascadePriorPipeline
from .stable_diffusion import (
@@ -175,7 +196,7 @@
StableDiffusionPipeline,
StableDiffusionUpscalePipeline,
)
from .stable_diffusion_3 import StableDiffusion3Pipeline
from .stable_diffusion_3 import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline
from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline
from .stable_diffusion_xl import (
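The registrations above follow the diffusers lazy-import convention: each new pipeline is listed once in _import_structure (resolved lazily at runtime) and once in the TYPE_CHECKING block (imported eagerly only for static analysis). Below is a minimal sketch of that pattern, reduced to a single entry; the _LazyModule tail is the standard diffusers boilerplate and is assumed unchanged by this commit, since it is not shown in the truncated diff:

# Sketch of the pipelines/__init__.py lazy-import pattern (illustrative, not part of the diff)
import sys
from typing import TYPE_CHECKING

from ..utils import _LazyModule  # diffusers-style lazy-module helper

_import_structure = {
    "hunyuandit": ["HunyuanDiTPipeline"],  # submodule name -> exported pipeline classes
}

if TYPE_CHECKING:
    # eager imports, visible only to type checkers and IDEs
    from .hunyuandit import HunyuanDiTPipeline
else:
    # at runtime the module object is replaced by a proxy that imports each
    # submodule the first time one of its attributes is accessed
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)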
2 changes: 2 additions & 0 deletions mindone/diffusers/pipelines/animatediff/__init__.py
@@ -7,11 +7,13 @@


_import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
_import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
_import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]


if TYPE_CHECKING:
from .pipeline_animatediff import AnimateDiffPipeline
from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
from .pipeline_output import AnimateDiffPipelineOutput
else:
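For context, here is a hedged usage sketch of the newly registered AnimateDiffSDXLPipeline, mirroring the upstream diffusers example. The checkpoint ids, the mindspore_dtype keyword, and the tuple-style output handling are assumptions drawn from the usual mindone conventions, not from this diff:

import mindspore as ms

from mindone.diffusers import DDIMScheduler
from mindone.diffusers.models.unets.unet_motion_model import MotionAdapter
from mindone.diffusers.pipelines import AnimateDiffSDXLPipeline

# SDXL motion-adapter weights (checkpoint id assumed for illustration)
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-sdxl-beta", mindspore_dtype=ms.float16)

pipe = AnimateDiffSDXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # base SDXL weights (id assumed)
    motion_adapter=adapter,
    mindspore_dtype=ms.float16,
)
pipe.scheduler = DDIMScheduler.from_config(
    pipe.scheduler.config, clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1
)

output = pipe(prompt="a panda surfing a wave, highly detailed", num_frames=16, num_inference_steps=25)
# mindone pipelines typically return a tuple unless return_dict=True is passed
frames = output[0][0] if isinstance(output, tuple) else output.frames[0]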
49 changes: 15 additions & 34 deletions mindone/diffusers/pipelines/animatediff/pipeline_animatediff.py
@@ -22,7 +22,7 @@
from mindspore import ops

from ....transformers import CLIPTextModel, CLIPVisionModelWithProjection
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...image_processor import PipelineImageInput
from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
from ...models.unets.unet_motion_model import MotionAdapter
@@ -36,6 +36,7 @@
)
from ...utils import deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.mindspore_utils import randn_tensor
from ...video_processor import VideoProcessor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import AnimateDiffPipelineOutput

@@ -81,27 +82,6 @@
"""


def tensor2vid(video: ms.Tensor, processor: "VaeImageProcessor", output_type: str = "np"):
batch_size, channels, num_frames, height, width = video.shape
outputs = []
for batch_idx in range(batch_size):
batch_vid = video[batch_idx].permute(1, 0, 2, 3)
batch_output = processor.postprocess(batch_vid, output_type)

outputs.append(batch_output)

if output_type == "np":
outputs = np.stack(outputs)

elif output_type == "pt":
outputs = ops.stack(outputs)

elif not output_type == "pil":
raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pil']")

return outputs


class AnimateDiffPipeline(
DiffusionPipeline,
TextualInversionLoaderMixin,
@@ -145,7 +125,7 @@ def __init__(
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
unet: Union[UNet2DConditionModel, UNetMotionModel],
motion_adapter: MotionAdapter,
scheduler: Union[
DDIMScheduler,
@@ -173,7 +153,7 @@ def __init__(
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt # noqa: E501
def encode_prompt(
@@ -341,9 +321,10 @@ def encode_prompt(
negative_prompt_embeds = negative_prompt_embeds.tile((1, num_images_per_prompt, 1))
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

if isinstance(self, LoraLoaderMixin):
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(self.text_encoder, lora_scale)
if self.text_encoder is not None:
if isinstance(self, LoraLoaderMixin):
# Retrieve the original scale by scaling back the LoRA layers
unscale_lora_layers(self.text_encoder, lora_scale)

return prompt_embeds, negative_prompt_embeds

@@ -636,11 +617,11 @@ def __call__(
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*):
Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[ms.Tensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
if `do_classifier_free_guidance` is set to `True`.
If not provided, embeddings are computed from the `ip_adapter_image` input argument.
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated video. Choose between `ms.Tensor`, `PIL.Image` or
`np.array`.
@@ -782,7 +763,7 @@ def __call__(
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order

# 8. Denoising loop
with self.progress_bar(total=num_inference_steps) as progress_bar:
with self.progress_bar(total=self._num_timesteps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
latent_model_input = ops.cat([latents] * 2) if self.do_classifier_free_guidance else latents
@@ -834,7 +815,7 @@ def __call__(
video = latents
else:
video_tensor = self.decode_latents(latents)
video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)

if not return_dict:
return (video,)
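The remaining hunks swap the module-level tensor2vid helper (deleted above) for the shared VideoProcessor added in v0.29. Below is a call-site sketch of the migration; the replacement lines are taken from this diff, and the assumption is that postprocess_video performs the same per-batch permute/postprocess/stack that the removed helper did:

# v0.28-style: per-file helper driven by the VaeImageProcessor
video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)

# v0.29-style: a VideoProcessor is built once in __init__ ...
self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
# ... and reused wherever decoded latents need to become frames
video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)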
(The remaining changed files are not expanded in this view.)
