34.1 kB

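	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Perturbed-Attention Guidance&quot;,&quot;local&quot;:&quot;perturbed-attention-guidance&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;General tasks&quot;,&quot;local&quot;:&quot;general-tasks&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;PAG with ControlNet&quot;,&quot;local&quot;:&quot;pag-with-controlnet&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;PAG with IP-Adapter&quot;,&quot;local&quot;:&quot;pag-with-ip-adapter&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Configure parameters&quot;,&quot;local&quot;:&quot;configure-parameters&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;pag_applied_layers&quot;,&quot;local&quot;:&quot;pagappliedlayers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">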
	<link href="/docs/diffusers/pr_12403/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/entry/start.33959e67.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/scheduler.8c3d61f6.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/singletons.46d5608c.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/index.0997d446.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/paths.0dc9c45f.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/entry/app.87796ad1.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/index.da70eac4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/nodes/0.9198881c.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/nodes/316.e6b7b1cd.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/Tip.6f698f24.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/CodeBlock.a9c4becf.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/getInferenceSnippets.ea1775db.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_12403/en/_app/immutable/chunks/HfOption.6c3b4e77.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Perturbed-Attention Guidance","local":"perturbed-attention-guidance","sections":[{"title":"General tasks","local":"general-tasks","sections":[],"depth":2},{"title":"PAG with ControlNet","local":"pag-with-controlnet","sections":[],"depth":2},{"title":"PAG with IP-Adapter","local":"pag-with-ip-adapter","sections":[],"depth":2},{"title":"Configure parameters","local":"configure-parameters","sections":[{"title":"pag_applied_layers","local":"pagappliedlayers","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="perturbed-attention-guidance" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#perturbed-attention-guidance"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Perturbed-Attention Guidance</span></h1> <p data-svelte-h="svelte-1w1c4u2"><a href="https://ku-cvlab.github.io/Perturbed-Attention-Guidance/" rel="nofollow">Perturbed-Attention Guidance (PAG)</a> is a new diffusion 
sampling guidance that improves sample quality across both unconditional and conditional settings, achieving this without requiring further training or the integration of external modules. PAG is designed to progressively enhance the structure of synthesized samples throughout the denoising process by considering the self-attention mechanisms’ ability to capture structural information. It involves generating intermediate samples with degraded structure by substituting selected self-attention maps in diffusion U-Net with an identity matrix, and guiding the denoising process away from these degraded samples.</p> <p data-svelte-h="svelte-11cdpcr">This guide will show you how to use PAG for various tasks and use cases.</p> <h2 class="relative group"><a id="general-tasks" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#general-tasks"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>General tasks</span></h2> <p data-svelte-h="svelte-1y4y4af">You can apply PAG to the <a href="/docs/diffusers/pr_12403/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline">StableDiffusionXLPipeline</a> for tasks 
such as text-to-image, image-to-image, and inpainting. To enable PAG for a specific task, load the pipeline using the <a href="../api/pipelines/auto_pipeline">AutoPipeline</a> API with the <code>enable_pag=True</code> flag and the <code>pag_applied_layers</code> argument.</p> <blockquote class="tip" data-svelte-h="svelte-1y7plbt"><p>🤗 Diffusers currently only supports using PAG with selected SDXL pipelines and <a href="/docs/diffusers/pr_12403/en/api/pipelines/pag#diffusers.PixArtSigmaPAGPipeline">PixArtSigmaPAGPipeline</a>. But feel free to open a <a href="https://github.com/huggingface/diffusers/issues/new/choose" rel="nofollow">feature request</a> if you want to add PAG support to a new pipeline!</p></blockquote> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Text-to-image </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Image-to-image </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Inpainting </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image
	<span class="hljs-keyword">import</span> torch

	pipeline = AutoPipelineForText2Image.from_pretrained(
	<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>,
	enable_pag=<span class="hljs-literal">True</span>,
	pag_applied_layers=[<span class="hljs-string">"mid"</span>],
	torch_dtype=torch.float16
	)
	pipeline.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-1qub4u5"><p>The <code>pag_applied_layers</code> argument allows you to specify which layers PAG is applied to. Additionally, you can use the <code>set_pag_applied_layers</code> method to update these layers after the pipeline has been created. Check out the <a href="#pagappliedlayers">pag_applied_layers</a> section to learn more about applying PAG to other layers.</p></blockquote> <p data-svelte-h="svelte-ggenwh">If you already have a pipeline created and loaded, you can enable PAG on it using the <code>from_pipe</code> API with the <code>enable_pag</code> flag. Internally, a PAG pipeline is created based on the pipeline and task you specified. In the example below, since we used <code>AutoPipelineForText2Image</code> and passed a <code>StableDiffusionXLPipeline</code>, a <code>StableDiffusionXLPAGPipeline</code> is created accordingly. Note that this does not require additional memory, and you will have both <code>StableDiffusionXLPipeline</code> and <code>StableDiffusionXLPAGPipeline</code> loaded and ready to use. 
You can read more about the <code>from_pipe</code> API and how to reuse pipelines in diffuser <a href="https://huggingface.co/docs/diffusers/using-diffusers/loading#reuse-a-pipeline" rel="nofollow">here</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline_sdxl = AutoPipelineForText2Image.from_pretrained(<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>, torch_dtype=torch.float16)
	pipeline = AutoPipelineForText2Image.from_pipe(pipeline_sdxl, enable_pag=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-43v4ib">To generate an image, you will also need to pass a <code>pag_scale</code>. When <code>pag_scale</code> increases, images gain more semantically coherent structures and exhibit fewer artifacts. However overly large guidance scale can lead to smoother textures and slight saturation in the images, similarly to CFG. <code>pag_scale=3.0</code> is used in the official demo and works well in most of the use cases, but feel free to experiment and select the appropriate value according to your needs! PAG is disabled when <code>pag_scale=0</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = <span class="hljs-string">"an insect robot preparing a 
delicious meal, anime style"</span>

	<span class="hljs-keyword">for</span> pag_scale <span class="hljs-keyword">in</span> [<span class="hljs-number">0.0</span>, <span class="hljs-number">3.0</span>]:
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)
	images = pipeline(
	prompt=prompt,
	num_inference_steps=<span class="hljs-number">25</span>,
	guidance_scale=<span class="hljs-number">7.0</span>,
	generator=generator,
	pag_scale=pag_scale,
	).images<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-1ebso2i"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_0.0_cfg_7.0_mid.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image without PAG</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_3.0_cfg_7.0_mid.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image with PAG</figcaption></div></div> </div> <h2 class="relative group"><a id="pag-with-controlnet" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pag-with-controlnet"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>PAG with ControlNet</span></h2> <p data-svelte-h="svelte-4m1dma">To use PAG with ControlNet, first create a <code>controlnet</code>. 
Then, pass the <code>controlnet</code> and other PAG arguments to the <code>from_pretrained</code> method of the AutoPipeline for the specified task.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image, ControlNetModel
	<span class="hljs-keyword">import</span> torch

	controlnet = ControlNetModel.from_pretrained(
	<span class="hljs-string">"diffusers/controlnet-canny-sdxl-1.0"</span>, torch_dtype=torch.float16
	)

	pipeline = AutoPipelineForText2Image.from_pretrained(
	<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>,
	controlnet=controlnet,
	enable_pag=<span class="hljs-literal">True</span>,
	pag_applied_layers=<span class="hljs-string">"mid"</span>,
	torch_dtype=torch.float16
	)
	pipeline.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <blockquote class="tip"><p data-svelte-h="svelte-2vpeo4">If you already have a controlnet pipeline and want to enable PAG, you can use the <code>from_pipe</code> API: <code>AutoPipelineForText2Image.from_pipe(pipeline_controlnet, enable_pag=True)</code></p></blockquote> <p data-svelte-h="svelte-112ukzn">You can use the pipeline in the same way you normally use ControlNet pipelines, with the added option to specify a <code>pag_scale</code> parameter. Note that PAG works well for unconditional generation. In this example, we will generate an image without a prompt.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image
	canny_image = load_image(
	<span class="hljs-string">"https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_control_input.png"</span>
	)

	<span class="hljs-keyword">for</span> pag_scale <span class="hljs-keyword">in</span> [<span class="hljs-number">0.0</span>, <span class="hljs-number">3.0</span>]:
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">1</span>)
	images = pipeline(
	prompt=<span class="hljs-string">""</span>,
	controlnet_conditioning_scale=controlnet_conditioning_scale,
	image=canny_image,
	num_inference_steps=<span class="hljs-number">50</span>,
	guidance_scale=<span class="hljs-number">0</span>,
	generator=generator,
	pag_scale=pag_scale,
	).images
	images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-e2cxkg"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_0.0_controlnet.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image without PAG</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_3.0_controlnet.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image with PAG</figcaption></div></div> <h2 class="relative group"><a id="pag-with-ip-adapter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pag-with-ip-adapter"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>PAG with IP-Adapter</span></h2> <p data-svelte-h="svelte-rjcwxp"><a href="https://hf.co/papers/2308.06721" rel="nofollow">IP-Adapter</a> is a popular model that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. 
You can enable PAG on a pipeline with IP-Adapter loaded.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> CLIPVisionModelWithProjection
	<span class="hljs-keyword">import</span> torch

	image_encoder = CLIPVisionModelWithProjection.from_pretrained(
	<span class="hljs-string">"h94/IP-Adapter"</span>,
	subfolder=<span class="hljs-string">"models/image_encoder"</span>,
	torch_dtype=torch.float16
	)

	pipeline = AutoPipelineForText2Image.from_pretrained(
	<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>,
	image_encoder=image_encoder,
	enable_pag=<span class="hljs-literal">True</span>,
	torch_dtype=torch.float16
	).to(<span class="hljs-string">"cuda"</span>)

	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"sdxl_models"</span>, weight_name=<span class="hljs-string">"ip-adapter-plus_sdxl_vit-h.bin"</span>)

	pag_scale = <span class="hljs-number">5.0</span>
	ip_adapter_scale = <span class="hljs-number">0.8</span>

	image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"</span>)

	pipeline.set_ip_adapter_scale(ip_adapter_scale)
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)
	images = pipeline(
	prompt=<span class="hljs-string">"a polar bear sitting in a chair drinking a milkshake"</span>,
	ip_adapter_image=image,
	negative_prompt=<span class="hljs-string">"deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">25</span>,
	guidance_scale=<span class="hljs-number">3.0</span>,
	generator=generator,
	pag_scale=pag_scale,
	).images
	images[<span class="hljs-number">0</span>]
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1n1gw76">PAG reduces artifacts and improves the overall composition.</p> <div class="flex flex-row gap-4" data-svelte-h="svelte-j0lguw"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_0.0_ipa_0.8.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image without PAG</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_5.0_ipa_0.8.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image with PAG</figcaption></div></div> <h2 class="relative group"><a id="configure-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configure-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configure parameters</span></h2> <h3 class="relative group"><a id="pagappliedlayers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#pagappliedlayers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>pag_applied_layers</span></h3> <p data-svelte-h="svelte-1th38n0">The <code>pag_applied_layers</code> argument allows you to specify which layers PAG is applied to. By default, it applies only to the mid blocks. Changing this setting will significantly impact the output. 
You can use the <code>set_pag_applied_layers</code> method to adjust the PAG layers after the pipeline is created, helping you find the optimal layers for your model.</p> <p data-svelte-h="svelte-1d06cik">As an example, here are the images generated with <code>pag_layers = ["down.block_2"]</code> and <code>pag_layers = ["down.block_2", "up.block_1.attentions_0"]</code></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = <span class="hljs-string">"an insect robot preparing a delicious meal, anime style"</span>
	pipeline.set_pag_applied_layers(pag_layers)
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)
	images = pipeline(
	prompt=prompt,
	num_inference_steps=<span class="hljs-number">25</span>,
	guidance_scale=guidance_scale,
	generator=generator,
	pag_scale=pag_scale,
	).images
	images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-1srsjub"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_3.0_cfg_7.0_down2_up1a0.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">down.block_2 + up.block1.attentions_0</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/pag_3.0_cfg_7.0_down2.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">down.block_2</figcaption></div></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/using-diffusers/pag.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Page bootstrap: publish the path configuration on a global object, then
		// load the two entry modules and call start() on the first one.
		// NOTE(review): the global name and hashed module paths look build-generated
		// (SvelteKit-style) — presumably they must stay in sync with the bundle; confirm.
		__sveltekit_g87enx = {
			assets: "/docs/diffusers/pr_12403/en",
			base: "/docs/diffusers/pr_12403/en",
			env: {}
		};

		// Read synchronously while this <script> is still the current script; its
		// parent element is what gets handed to start() below.
		const mountTarget = document.currentScript.parentElement;

		// Both dynamic imports are kicked off together and awaited as a pair.
		const entryModules = [
			import("/docs/diffusers/pr_12403/en/_app/immutable/entry/start.33959e67.js"),
			import("/docs/diffusers/pr_12403/en/_app/immutable/entry/app.87796ad1.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];

			kit.start(app, mountTarget, {
				node_ids: [0, 316],
				data: [null, null],
				form: null,
				error: null
			});
		});
	}
	</script>

Xet Storage Details

Size:: 34.1 kB
Xet hash:: 018e46db9e9846cd45e26ce27019f85a58a04e27ccca4cba5bcb9d6fb428f3b7

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.