Spaces:

xiaozeng
/

lora_test

Runtime error

App Files Files Community

lora_test / ppdiffusers /pipelines /versatile_diffusion /pipeline_versatile_diffusion.py

xiaozeng

Upload with huggingface_hub

05654ff about 3 years ago

raw

history blame contribute delete

23.7 kB

	# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import inspect
	from typing import Callable, List, Optional, Union

	import paddle
	import PIL.Image

	from paddlenlp.transformers import (
	CLIPFeatureExtractor,
	CLIPTextModelWithProjection,
	CLIPTokenizer,
	CLIPVisionModelWithProjection,
	)

	from ...models import AutoencoderKL, UNet2DConditionModel
	from ...pipeline_utils import DiffusionPipeline
	from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
	from ...utils import logging
	from .modeling_text_unet import UNetFlatConditionModel
	from .pipeline_versatile_diffusion_dual_guided import (
	VersatileDiffusionDualGuidedPipeline,
	)
	from .pipeline_versatile_diffusion_image_variation import (
	VersatileDiffusionImageVariationPipeline,
	)
	from .pipeline_versatile_diffusion_text_to_image import (
	VersatileDiffusionTextToImagePipeline,
	)

	# Module-level logger obtained from the package's own `logging` utility and named after this module.
	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	class VersatileDiffusionPipeline(DiffusionPipeline):
	    r"""
	    Pipeline for generation using Versatile Diffusion.

	    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
	    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

	    This class runs no denoising loop itself: each public method (`image_variation`, `text_to_image`,
	    `dual_guided`) builds the matching task pipeline from the registered components and delegates the call to it.

	    Args:
	        vae ([`AutoencoderKL`]):
	            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
	        text_encoder ([`CLIPTextModelWithProjection`]):
	            Frozen text-encoder. Versatile Diffusion uses the text portion of
	            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
	            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
	        image_encoder ([`CLIPVisionModelWithProjection`]):
	            Frozen vision-encoder. Versatile Diffusion uses the vision portion of
	            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModelWithProjection),
	            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
	        tokenizer (`CLIPTokenizer`):
	            Tokenizer of class
	            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
	        image_unet ([`UNet2DConditionModel`]):
	            Conditional U-Net architecture to denoise the encoded image latents.
	        text_unet ([`UNetFlatConditionModel`]):
	            Flat-conditioned U-Net registered next to `image_unet`. It is forwarded to every task pipeline whose
	            constructor accepts a `text_unet` argument. NOTE(review): its exact role is defined by those task
	            pipelines, not by this file.
	        scheduler ([`SchedulerMixin`]):
	            A scheduler to be used in combination with `image_unet` to denoise the encoded image latents. Can be one
	            of [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
	        image_feature_extractor ([`CLIPFeatureExtractor`]):
	            Feature extractor registered for image inputs; presumably it preprocesses images for `image_encoder`
	            (confirm against the task pipelines). This pipeline registers no `safety_checker`.
	    """

	    tokenizer: CLIPTokenizer
	    image_feature_extractor: CLIPFeatureExtractor
	    text_encoder: CLIPTextModelWithProjection
	    image_encoder: CLIPVisionModelWithProjection
	    image_unet: UNet2DConditionModel
	    text_unet: UNetFlatConditionModel
	    vae: AutoencoderKL
	    scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]

	    def __init__(
	        self,
	        tokenizer: CLIPTokenizer,
	        image_feature_extractor: CLIPFeatureExtractor,
	        text_encoder: CLIPTextModelWithProjection,
	        image_encoder: CLIPVisionModelWithProjection,
	        image_unet: UNet2DConditionModel,
	        text_unet: UNetFlatConditionModel,
	        vae: AutoencoderKL,
	        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
	    ):
	        """Register all sub-models and derive the VAE scale factor from the VAE configuration."""
	        super().__init__()

	        self.register_modules(
	            tokenizer=tokenizer,
	            image_feature_extractor=image_feature_extractor,
	            text_encoder=text_encoder,
	            image_encoder=image_encoder,
	            image_unet=image_unet,
	            text_unet=text_unet,
	            vae=vae,
	            scheduler=scheduler,
	        )
	        # One factor of two per VAE block after the first one.
	        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)

	    def _components_for(self, pipeline_cls):
	        """Return the subset of `self.components` whose names are parameters of `pipeline_cls.__init__`."""
	        accepted_names = inspect.signature(pipeline_cls.__init__).parameters.keys()
	        return {name: component for name, component in self.components.items() if name in accepted_names}

	    @paddle.no_grad()
	    def image_variation(
	        self,
	        image: Union[paddle.Tensor, PIL.Image.Image],
	        height: Optional[int] = None,
	        width: Optional[int] = None,
	        num_inference_steps: int = 50,
	        guidance_scale: float = 7.5,
	        negative_prompt: Optional[Union[str, List[str]]] = None,
	        num_images_per_prompt: Optional[int] = 1,
	        eta: float = 0.0,
	        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
	        latents: Optional[paddle.Tensor] = None,
	        output_type: Optional[str] = "pil",
	        return_dict: bool = True,
	        callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
	        callback_steps: Optional[int] = 1,
	    ):
	        r"""
	        Generate variations of an input image.

	        A temporary [`VersatileDiffusionImageVariationPipeline`] is built from the registered components that its
	        constructor accepts, and every argument below is forwarded to it unchanged.

	        Args:
	            image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `paddle.Tensor`):
	                The image prompt or prompts to guide the image generation.
	            height (`int`, optional, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
	                The height in pixels of the generated image.
	            width (`int`, optional, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
	                The width in pixels of the generated image.
	            num_inference_steps (`int`, optional, defaults to 50):
	                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
	                expense of slower inference.
	            guidance_scale (`float`, optional, defaults to 7.5):
	                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
	                `guidance_scale` is defined as `w` of equation 2. of [Imagen
	                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
	                1`. Higher guidance scale encourages to generate images that are closely linked to the input,
	                usually at the expense of lower image quality.
	            negative_prompt (`str` or `List[str]`, optional):
	                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e.,
	                ignored if `guidance_scale` is less than `1`).
	            num_images_per_prompt (`int`, optional, defaults to 1):
	                The number of images to generate per prompt.
	            eta (`float`, optional, defaults to 0.0):
	                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
	                [`schedulers.DDIMScheduler`], will be ignored for others.
	            generator (`paddle.Generator` or `List[paddle.Generator]`, optional):
	                A [paddle generator] to make generation deterministic.
	            latents (`paddle.Tensor`, optional):
	                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
	                generation. Can be used to tweak the same generation with different prompts. If not provided, a
	                latents tensor will be generated by sampling using the supplied random `generator`.
	            output_type (`str`, optional, defaults to `"pil"`):
	                The output format of the generated image. Choose between
	                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a pipeline output object instead of a plain tuple.
	            callback (`Callable`, optional):
	                A function that will be called every `callback_steps` steps during inference. The function will be
	                called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
	            callback_steps (`int`, optional, defaults to 1):
	                The frequency at which the `callback` function will be called. If not specified, the callback will
	                be called at every step.

	        Examples:

	        ```py
	        >>> from ppdiffusers import VersatileDiffusionPipeline
	        >>> import paddle
	        >>> import requests
	        >>> from io import BytesIO
	        >>> from PIL import Image

	        >>> # let's download an initial image
	        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"

	        >>> response = requests.get(url)
	        >>> image = Image.open(BytesIO(response.content)).convert("RGB")

	        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
	        ...     "shi-labs/versatile-diffusion"
	        ... )

	        >>> generator = paddle.Generator().manual_seed(0)
	        >>> image = pipe.image_variation(image, generator=generator).images[0]
	        >>> image.save("./car_variation.png")
	        ```

	        Returns:
	            Whatever the delegated [`VersatileDiffusionImageVariationPipeline`] call returns, passed through
	            unchanged. NOTE(review): presumably an output object exposing `.images` when `return_dict` is True
	            and otherwise a `tuple` whose first element is the list of generated images; confirm against the
	            task pipeline.
	        """
	        components = self._components_for(VersatileDiffusionImageVariationPipeline)
	        return VersatileDiffusionImageVariationPipeline(**components)(
	            image=image,
	            height=height,
	            width=width,
	            num_inference_steps=num_inference_steps,
	            guidance_scale=guidance_scale,
	            negative_prompt=negative_prompt,
	            num_images_per_prompt=num_images_per_prompt,
	            eta=eta,
	            generator=generator,
	            latents=latents,
	            output_type=output_type,
	            return_dict=return_dict,
	            callback=callback,
	            callback_steps=callback_steps,
	        )

	    @paddle.no_grad()
	    def text_to_image(
	        self,
	        prompt: Union[str, List[str]],
	        height: Optional[int] = None,
	        width: Optional[int] = None,
	        num_inference_steps: int = 50,
	        guidance_scale: float = 7.5,
	        negative_prompt: Optional[Union[str, List[str]]] = None,
	        num_images_per_prompt: Optional[int] = 1,
	        eta: float = 0.0,
	        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
	        latents: Optional[paddle.Tensor] = None,
	        output_type: Optional[str] = "pil",
	        return_dict: bool = True,
	        callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
	        callback_steps: Optional[int] = 1,
	    ):
	        r"""
	        Generate images from a text prompt.

	        A temporary [`VersatileDiffusionTextToImagePipeline`] is built from the registered components that its
	        constructor accepts, and every argument below is forwarded to it unchanged. After the call, whether it
	        succeeds or raises, the U-Net attention blocks are swapped back to their original state.

	        Args:
	            prompt (`str` or `List[str]`):
	                The prompt or prompts to guide the image generation.
	            height (`int`, optional, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
	                The height in pixels of the generated image.
	            width (`int`, optional, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
	                The width in pixels of the generated image.
	            num_inference_steps (`int`, optional, defaults to 50):
	                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
	                expense of slower inference.
	            guidance_scale (`float`, optional, defaults to 7.5):
	                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
	                `guidance_scale` is defined as `w` of equation 2. of [Imagen
	                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
	                1`. Higher guidance scale encourages to generate images that are closely linked to the text
	                `prompt`, usually at the expense of lower image quality.
	            negative_prompt (`str` or `List[str]`, optional):
	                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e.,
	                ignored if `guidance_scale` is less than `1`).
	            num_images_per_prompt (`int`, optional, defaults to 1):
	                The number of images to generate per prompt.
	            eta (`float`, optional, defaults to 0.0):
	                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
	                [`schedulers.DDIMScheduler`], will be ignored for others.
	            generator (`paddle.Generator` or `List[paddle.Generator]`, optional):
	                A [paddle generator] to make generation deterministic.
	            latents (`paddle.Tensor`, optional):
	                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
	                generation. Can be used to tweak the same generation with different prompts. If not provided, a
	                latents tensor will be generated by sampling using the supplied random `generator`.
	            output_type (`str`, optional, defaults to `"pil"`):
	                The output format of the generated image. Choose between
	                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a pipeline output object instead of a plain tuple.
	            callback (`Callable`, optional):
	                A function that will be called every `callback_steps` steps during inference. The function will be
	                called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
	            callback_steps (`int`, optional, defaults to 1):
	                The frequency at which the `callback` function will be called. If not specified, the callback will
	                be called at every step.

	        Examples:

	        ```py
	        >>> from ppdiffusers import VersatileDiffusionPipeline
	        >>> import paddle

	        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
	        ...     "shi-labs/versatile-diffusion"
	        ... )

	        >>> generator = paddle.Generator().manual_seed(0)
	        >>> image = pipe.text_to_image("an astronaut riding on a horse on mars", generator=generator).images[0]
	        >>> image.save("./astronaut.png")
	        ```

	        Returns:
	            Whatever the delegated [`VersatileDiffusionTextToImagePipeline`] call returns, passed through
	            unchanged. NOTE(review): presumably an output object exposing `.images` when `return_dict` is True
	            and otherwise a `tuple` whose first element is the list of generated images; confirm against the
	            task pipeline.
	        """
	        components = self._components_for(VersatileDiffusionTextToImagePipeline)
	        temp_pipeline = VersatileDiffusionTextToImagePipeline(**components)
	        try:
	            output = temp_pipeline(
	                prompt=prompt,
	                height=height,
	                width=width,
	                num_inference_steps=num_inference_steps,
	                guidance_scale=guidance_scale,
	                negative_prompt=negative_prompt,
	                num_images_per_prompt=num_images_per_prompt,
	                eta=eta,
	                generator=generator,
	                latents=latents,
	                output_type=output_type,
	                return_dict=return_dict,
	                callback=callback,
	                callback_steps=callback_steps,
	            )
	        finally:
	            # Swap the attention blocks back to the original state. The temporary pipeline holds the very same
	            # module objects as this pipeline, so this must also happen when generation raises; otherwise later
	            # calls would run on modules left in the swapped state.
	            temp_pipeline._swap_unet_attention_blocks()

	        return output

	    @paddle.no_grad()
	    def dual_guided(
	        self,
	        prompt: Union[str, List[str]],
	        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
	        text_to_image_strength: float = 0.5,
	        height: Optional[int] = None,
	        width: Optional[int] = None,
	        num_inference_steps: int = 50,
	        guidance_scale: float = 7.5,
	        num_images_per_prompt: Optional[int] = 1,
	        eta: float = 0.0,
	        generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None,
	        latents: Optional[paddle.Tensor] = None,
	        output_type: Optional[str] = "pil",
	        return_dict: bool = True,
	        callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None,
	        callback_steps: Optional[int] = 1,
	    ):
	        r"""
	        Generate images guided by both a text prompt and an image.

	        A temporary [`VersatileDiffusionDualGuidedPipeline`] is built from the registered components that its
	        constructor accepts, and every argument below is forwarded to it unchanged. After the call, whether it
	        succeeds or raises, the temporary pipeline's dual-attention setup is reverted.

	        Args:
	            prompt (`str` or `List[str]`):
	                The text prompt or prompts to guide the image generation.
	            image (`PIL.Image.Image` or `List[PIL.Image.Image]`):
	                The image prompt or prompts to guide the image generation.
	            text_to_image_strength (`float`, optional, defaults to 0.5):
	                Weight balancing the text and the image conditions. NOTE(review): presumably larger values give
	                the text prompt more influence; confirm against [`VersatileDiffusionDualGuidedPipeline`].
	            height (`int`, optional, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
	                The height in pixels of the generated image.
	            width (`int`, optional, defaults to self.image_unet.config.sample_size * self.vae_scale_factor):
	                The width in pixels of the generated image.
	            num_inference_steps (`int`, optional, defaults to 50):
	                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
	                expense of slower inference.
	            guidance_scale (`float`, optional, defaults to 7.5):
	                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
	                `guidance_scale` is defined as `w` of equation 2. of [Imagen
	                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
	                1`. Higher guidance scale encourages to generate images that are closely linked to the text
	                `prompt`, usually at the expense of lower image quality.
	            num_images_per_prompt (`int`, optional, defaults to 1):
	                The number of images to generate per prompt.
	            eta (`float`, optional, defaults to 0.0):
	                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
	                [`schedulers.DDIMScheduler`], will be ignored for others.
	            generator (`paddle.Generator` or `List[paddle.Generator]`, optional):
	                A [paddle generator] to make generation deterministic.
	            latents (`paddle.Tensor`, optional):
	                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
	                generation. Can be used to tweak the same generation with different prompts. If not provided, a
	                latents tensor will be generated by sampling using the supplied random `generator`.
	            output_type (`str`, optional, defaults to `"pil"`):
	                The output format of the generated image. Choose between
	                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a pipeline output object instead of a plain tuple.
	            callback (`Callable`, optional):
	                A function that will be called every `callback_steps` steps during inference. The function will be
	                called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`.
	            callback_steps (`int`, optional, defaults to 1):
	                The frequency at which the `callback` function will be called. If not specified, the callback will
	                be called at every step.

	        Examples:

	        ```py
	        >>> from ppdiffusers import VersatileDiffusionPipeline
	        >>> import paddle
	        >>> import requests
	        >>> from io import BytesIO
	        >>> from PIL import Image

	        >>> # let's download an initial image
	        >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg"

	        >>> response = requests.get(url)
	        >>> image = Image.open(BytesIO(response.content)).convert("RGB")
	        >>> text = "a red car in the sun"

	        >>> pipe = VersatileDiffusionPipeline.from_pretrained(
	        ...     "shi-labs/versatile-diffusion"
	        ... )

	        >>> generator = paddle.Generator().manual_seed(0)
	        >>> text_to_image_strength = 0.75

	        >>> image = pipe.dual_guided(
	        ...     prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator
	        ... ).images[0]
	        >>> image.save("./car_variation.png")
	        ```

	        Returns:
	            Whatever the delegated [`VersatileDiffusionDualGuidedPipeline`] call returns, passed through
	            unchanged. NOTE(review): presumably an [`~pipelines.stable_diffusion.ImagePipelineOutput`] when
	            `return_dict` is True and otherwise a `tuple` whose first element is the list of generated images;
	            confirm against the task pipeline.
	        """
	        components = self._components_for(VersatileDiffusionDualGuidedPipeline)
	        temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components)
	        try:
	            output = temp_pipeline(
	                prompt=prompt,
	                image=image,
	                text_to_image_strength=text_to_image_strength,
	                height=height,
	                width=width,
	                num_inference_steps=num_inference_steps,
	                guidance_scale=guidance_scale,
	                num_images_per_prompt=num_images_per_prompt,
	                eta=eta,
	                generator=generator,
	                latents=latents,
	                output_type=output_type,
	                return_dict=return_dict,
	                callback=callback,
	                callback_steps=callback_steps,
	            )
	        finally:
	            # Revert the dual-attention setup even when generation raises: the temporary pipeline holds the very
	            # same module objects as this pipeline, so skipping the revert would leave them modified for every
	            # later call.
	            temp_pipeline._revert_dual_attention()

	        return output