87.1 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"IP-Adapter","local":"ip-adapter","sections":[{"title":"General tasks","local":"general-tasks","sections":[],"depth":2},{"title":"Configure parameters","local":"configure-parameters","sections":[{"title":"Image embeddings","local":"image-embeddings","sections":[],"depth":3},{"title":"IP-Adapter masking","local":"ip-adapter-masking","sections":[],"depth":3}],"depth":2},{"title":"Specific use cases","local":"specific-use-cases","sections":[{"title":"Face model","local":"face-model","sections":[],"depth":3},{"title":"Multi IP-Adapter","local":"multi-ip-adapter","sections":[],"depth":3},{"title":"Instant generation","local":"instant-generation","sections":[],"depth":3},{"title":"Structural control","local":"structural-control","sections":[],"depth":3},{"title":"Style & layout control","local":"style--layout-control","sections":[],"depth":3}],"depth":2}],"depth":1}">
	<link href="/docs/diffusers/pr_10567/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/entry/start.5ab964f0.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/scheduler.8c3d61f6.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/singletons.1271a703.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/index.0997d446.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/paths.af967ee5.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/entry/app.d83dbfce.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/index.da70eac4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/nodes/0.bb4a0671.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/nodes/247.d33a3667.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/Tip.1d9b8c37.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/CodeBlock.00a903b3.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/EditOnGithub.1e64e623.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/HfOption.c1483eb1.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/stores.d6eecc38.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"IP-Adapter","local":"ip-adapter","sections":[{"title":"General tasks","local":"general-tasks","sections":[],"depth":2},{"title":"Configure parameters","local":"configure-parameters","sections":[{"title":"Image embeddings","local":"image-embeddings","sections":[],"depth":3},{"title":"IP-Adapter masking","local":"ip-adapter-masking","sections":[],"depth":3}],"depth":2},{"title":"Specific use cases","local":"specific-use-cases","sections":[{"title":"Face model","local":"face-model","sections":[],"depth":3},{"title":"Multi IP-Adapter","local":"multi-ip-adapter","sections":[],"depth":3},{"title":"Instant generation","local":"instant-generation","sections":[],"depth":3},{"title":"Structural control","local":"structural-control","sections":[],"depth":3},{"title":"Style & layout control","local":"style--layout-control","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="ip-adapter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ip-adapter"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>IP-Adapter</span></h1> <p data-svelte-h="svelte-rd9njz"><a href="https://hf.co/papers/2308.06721" rel="nofollow">IP-Adapter</a> is an image prompt adapter that can be plugged into diffusion models to enable image prompting without any changes to the underlying model. Furthermore, this adapter can be reused with other models finetuned from the same base model and it can be combined with other adapters like <a href="../using-diffusers/controlnet">ControlNet</a>. The key idea behind IP-Adapter is the <em>decoupled cross-attention</em> mechanism which adds a separate cross-attention layer just for image features instead of using the same cross-attention layer for both text and image features. This allows the model to learn more image-specific features.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1lc7m22">Learn how to load an IP-Adapter in the <a href="../using-diffusers/loading_adapters#ip-adapter">Load adapters</a> guide, and make sure you check out the <a href="../using-diffusers/loading_adapters#ip-adapter-plus">IP-Adapter Plus</a> section which requires manually loading the image encoder.</p></div> <p data-svelte-h="svelte-1211s0v">This guide will walk you through using IP-Adapter for various tasks and use cases.</p> <h2 class="relative group"><a id="general-tasks" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#general-tasks"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>General tasks</span></h2> <p data-svelte-h="svelte-nn8jl7">Let’s take a look at how to use IP-Adapter’s image prompting capabilities with the <a href="/docs/diffusers/pr_10567/en/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline">StableDiffusionXLPipeline</a> for tasks like text-to-image, image-to-image, and inpainting. We also encourage you to try out other pipelines such as Stable Diffusion, LCM-LoRA, ControlNet, T2I-Adapter, or AnimateDiff!</p> <p data-svelte-h="svelte-1easplw">In all the following examples, you’ll see the <a href="/docs/diffusers/pr_10567/en/api/loaders/ip_adapter#diffusers.loaders.IPAdapterMixin.set_ip_adapter_scale">set_ip_adapter_scale()</a> method. This method controls the amount of text or image conditioning to apply to the model. A value of <code>1.0</code> means the model is only conditioned on the image prompt. Lowering this value encourages the model to produce more diverse images, but they may not be as aligned with the image prompt. Typically, a value of <code>0.5</code> achieves a good balance between the two prompt types and produces good results.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-18zjnah">In the examples below, try adding <code>low_cpu_mem_usage=True</code> to the <a href="/docs/diffusers/pr_10567/en/api/loaders/ip_adapter#diffusers.loaders.IPAdapterMixin.load_ip_adapter">load_ip_adapter()</a> method to speed up the loading time.</p></div> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Text-to-image </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Image-to-image </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Inpainting </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Video </div></div> <div class="language-select"><p data-svelte-h="svelte-1kuilfu">Crafting the precise text prompt to generate the image you want can be difficult because it may not always capture what you’d like to express. Adding an image alongside the text prompt helps the model better understand what it should generate and can lead to more accurate results.</p> <p data-svelte-h="svelte-qqjbyj">Load a Stable Diffusion XL (SDXL) model and insert an IP-Adapter into the model with the <a href="/docs/diffusers/pr_10567/en/api/loaders/ip_adapter#diffusers.loaders.IPAdapterMixin.load_ip_adapter">load_ip_adapter()</a> method. Use the <code>subfolder</code> parameter to load the SDXL model weights.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image
	<span class="hljs-keyword">import</span> torch

	pipeline = AutoPipelineForText2Image.from_pretrained(<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>, torch_dtype=torch.float16).to(<span class="hljs-string">"cuda"</span>)
	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"sdxl_models"</span>, weight_name=<span class="hljs-string">"ip-adapter_sdxl.bin"</span>)
	pipeline.set_ip_adapter_scale(<span class="hljs-number">0.6</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-yvbljp">Create a text prompt and load an image prompt before passing them to the pipeline to generate an image.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"</span>)
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)
	images = pipeline(
	prompt=<span class="hljs-string">"a polar bear sitting in a chair drinking a milkshake"</span>,
	ip_adapter_image=image,
	negative_prompt=<span class="hljs-string">"deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">100</span>,
	generator=generator,
	).images
	images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-4jgjc0"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner_2.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption></div></div> </div> <h2 class="relative group"><a id="configure-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configure-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configure parameters</span></h2> <p data-svelte-h="svelte-1ytpuar">There are a couple of IP-Adapter parameters that are useful to know about and can help you with your image generation tasks. These parameters can make your workflow more efficient or give you more control over image generation.</p> <h3 class="relative group"><a id="image-embeddings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-embeddings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image embeddings</span></h3> <p data-svelte-h="svelte-csxost">IP-Adapter enabled pipelines provide the <code>ip_adapter_image_embeds</code> parameter to accept precomputed image embeddings. This is particularly useful in scenarios where you need to run the IP-Adapter pipeline multiple times because you have more than one image. For example, <a href="#multi-ip-adapter">multi IP-Adapter</a> is a specific use case where you provide multiple styling images to generate a specific image in a specific style. Loading and encoding multiple images each time you use the pipeline would be inefficient. Instead, you can precompute and save the image embeddings to disk (which can save a lot of space if you’re using high-quality images) and load them when you need them.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1rk1t7q">This parameter also gives you the flexibility to load embeddings from other sources. For example, ComfyUI image embeddings for IP-Adapters are compatible with Diffusers and should work ouf-of-the-box!</p></div> <p data-svelte-h="svelte-1h2tzec">Call the <code>prepare_ip_adapter_image_embeds()</code> method to encode and generate the image embeddings. Then you can save them to disk with <code>torch.save</code>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-h0lw9v">If you’re using IP-Adapter with <code>ip_adapter_image_embedding</code> instead of <code>ip_adapter_image</code>’, you can set <code>load_ip_adapter(image_encoder_folder=None,...)</code> because you don’t need to load an encoder to generate the image embeddings.</p></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->image_embeds = pipeline.prepare_ip_adapter_image_embeds(
	ip_adapter_image=image,
	ip_adapter_image_embeds=<span class="hljs-literal">None</span>,
	device=<span class="hljs-string">"cuda"</span>,
	num_images_per_prompt=<span class="hljs-number">1</span>,
	do_classifier_free_guidance=<span class="hljs-literal">True</span>,
	)

	torch.save(image_embeds, <span class="hljs-string">"image_embeds.ipadpt"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nluq5y">Now load the image embeddings by passing them to the <code>ip_adapter_image_embeds</code> parameter.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->image_embeds = torch.load(<span class="hljs-string">"image_embeds.ipadpt"</span>)
	images = pipeline(
	prompt=<span class="hljs-string">"a polar bear sitting in a chair drinking a milkshake"</span>,
	ip_adapter_image_embeds=image_embeds,
	negative_prompt=<span class="hljs-string">"deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">100</span>,
	generator=generator,
	).images<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="ip-adapter-masking" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ip-adapter-masking"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>IP-Adapter masking</span></h3> <p data-svelte-h="svelte-1jbd2l3">Binary masks specify which portion of the output image should be assigned to an IP-Adapter. This is useful for composing more than one IP-Adapter image. For each input IP-Adapter image, you must provide a binary mask.</p> <p data-svelte-h="svelte-ci17qi">To start, preprocess the input IP-Adapter images with the <code>~image_processor.IPAdapterMaskProcessor.preprocess()</code> to generate their masks. For optimal results, provide the output height and width to <code>~image_processor.IPAdapterMaskProcessor.preprocess()</code>. This ensures masks with different aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, you don’t have to set the <code>height</code> and <code>width</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers.image_processor <span class="hljs-keyword">import</span> IPAdapterMaskProcessor

	mask1 = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask1.png"</span>)
	mask2 = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask2.png"</span>)

	output_height = <span class="hljs-number">1024</span>
	output_width = <span class="hljs-number">1024</span>

	processor = IPAdapterMaskProcessor()
	masks = processor.preprocess([mask1, mask2], height=output_height, width=output_width)<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-eifaa"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">mask one</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">mask two</figcaption></div></div> <p data-svelte-h="svelte-ep1q3l">When there is more than one input IP-Adapter image, load them as a list and provide the IP-Adapter scale list. Each of the input IP-Adapter images here corresponds to one of the masks generated above.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"sdxl_models"</span>, weight_name=[<span class="hljs-string">"ip-adapter-plus-face_sdxl_vit-h.safetensors"</span>])
	pipeline.set_ip_adapter_scale([[<span class="hljs-number">0.7</span>, <span class="hljs-number">0.7</span>]]) <span class="hljs-comment"># one scale for each image-mask pair</span>

	face_image1 = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png"</span>)
	face_image2 = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png"</span>)

	ip_images = [[face_image1, face_image2]]

	masks = [masks.reshape(<span class="hljs-number">1</span>, masks.shape[<span class="hljs-number">0</span>], masks.shape[<span class="hljs-number">2</span>], masks.shape[<span class="hljs-number">3</span>])]<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-m35a5e"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image one</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image two</figcaption></div></div> <p data-svelte-h="svelte-1oy5psd">Now pass the preprocessed masks to <code>cross_attention_kwargs</code> in the pipeline call.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)
	num_images = <span class="hljs-number">1</span>

	image = pipeline(
	prompt=<span class="hljs-string">"2 girls"</span>,
	ip_adapter_image=ip_images,
	negative_prompt=<span class="hljs-string">"monochrome, lowres, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">20</span>,
	num_images_per_prompt=num_images,
	generator=generator,
	cross_attention_kwargs={<span class="hljs-string">"ip_adapter_masks"</span>: masks}
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-1rjzzk0"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_attention_mask_result_seed_0.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter masking applied</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_no_attention_mask_result_seed_0.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">no IP-Adapter masking applied</figcaption></div></div> <h2 class="relative group"><a id="specific-use-cases" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#specific-use-cases"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Specific use cases</span></h2> <p data-svelte-h="svelte-3uurtn">IP-Adapter’s image prompting and compatibility with other adapters and models makes it a versatile tool for a variety of use cases. This section covers some of the more popular applications of IP-Adapter, and we can’t wait to see what you come up with!</p> <h3 class="relative group"><a id="face-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#face-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Face model</span></h3> <p data-svelte-h="svelte-1hojnjd">Generating accurate faces is challenging because they are complex and nuanced. Diffusers supports two IP-Adapter checkpoints specifically trained to generate faces from the <a href="https://huggingface.co/h94/IP-Adapter" rel="nofollow">h94/IP-Adapter</a> repository:</p> <ul data-svelte-h="svelte-1a56pqx"><li><a href="https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors" rel="nofollow">ip-adapter-full-face_sd15.safetensors</a> is conditioned with images of cropped faces and removed backgrounds</li> <li><a href="https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors" rel="nofollow">ip-adapter-plus-face_sd15.safetensors</a> uses patch embeddings and is conditioned with images of cropped faces</li></ul> <p data-svelte-h="svelte-1ylc0is">Additionally, Diffusers supports all IP-Adapter checkpoints trained with face embeddings extracted by <code>insightface</code> face models. Supported models are from the <a href="https://huggingface.co/h94/IP-Adapter-FaceID" rel="nofollow">h94/IP-Adapter-FaceID</a> repository.</p> <p data-svelte-h="svelte-1p2g7i8">For face models, use the <a href="https://huggingface.co/h94/IP-Adapter" rel="nofollow">h94/IP-Adapter</a> checkpoint. It is also recommended to use <a href="/docs/diffusers/pr_10567/en/api/schedulers/ddim#diffusers.DDIMScheduler">DDIMScheduler</a> or <a href="/docs/diffusers/pr_10567/en/api/schedulers/euler#diffusers.EulerDiscreteScheduler">EulerDiscreteScheduler</a> for face models.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline, DDIMScheduler
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image

	pipeline = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)
	pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"models"</span>, weight_name=<span class="hljs-string">"ip-adapter-full-face_sd15.bin"</span>)

	pipeline.set_ip_adapter_scale(<span class="hljs-number">0.5</span>)

	image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein_base.png"</span>)
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">26</span>)

	image = pipeline(
	prompt=<span class="hljs-string">"A photo of Einstein as a chef, wearing an apron, cooking in a French restaurant"</span>,
	ip_adapter_image=image,
	negative_prompt=<span class="hljs-string">"lowres, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">100</span>,
	generator=generator,
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-lbwedl"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein_base.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_einstein.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption></div></div> <p data-svelte-h="svelte-10x245s">To use IP-Adapter FaceID models, first extract face embeddings with <code>insightface</code>. Then pass the list of tensors to the pipeline as <code>ip_adapter_image_embeds</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionPipeline, DDIMScheduler
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image
	<span class="hljs-keyword">from</span> insightface.app <span class="hljs-keyword">import</span> FaceAnalysis

	pipeline = StableDiffusionPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>,
	torch_dtype=torch.float16,
	).to(<span class="hljs-string">"cuda"</span>)
	pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter-FaceID"</span>, subfolder=<span class="hljs-literal">None</span>, weight_name=<span class="hljs-string">"ip-adapter-faceid_sd15.bin"</span>, image_encoder_folder=<span class="hljs-literal">None</span>)
	pipeline.set_ip_adapter_scale(<span class="hljs-number">0.6</span>)

	image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png"</span>)

	ref_images_embeds = []
	app = FaceAnalysis(name=<span class="hljs-string">"buffalo_l"</span>, providers=[<span class="hljs-string">'CUDAExecutionProvider'</span>, <span class="hljs-string">'CPUExecutionProvider'</span>])
	app.prepare(ctx_id=<span class="hljs-number">0</span>, det_size=(<span class="hljs-number">640</span>, <span class="hljs-number">640</span>))
	image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
	faces = app.get(image)
	image = torch.from_numpy(faces[<span class="hljs-number">0</span>].normed_embedding)
	ref_images_embeds.append(image.unsqueeze(<span class="hljs-number">0</span>))
	ref_images_embeds = torch.stack(ref_images_embeds, dim=<span class="hljs-number">0</span>).unsqueeze(<span class="hljs-number">0</span>)
	neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
	id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device=<span class="hljs-string">"cuda"</span>)

	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">42</span>)

	images = pipeline(
	prompt=<span class="hljs-string">"A photo of a girl"</span>,
	ip_adapter_image_embeds=[id_embeds],
	negative_prompt=<span class="hljs-string">"monochrome, lowres, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">20</span>, num_images_per_prompt=<span class="hljs-number">1</span>,
	generator=generator
	).images<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mw36co">Both IP-Adapter FaceID Plus and Plus v2 models require CLIP image embeddings. You can prepare face embeddings as shown previously, then you can extract and pass CLIP embeddings to the hidden image projection layers.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> insightface.utils <span class="hljs-keyword">import</span> face_align

	ref_images_embeds = []
	ip_adapter_images = []
	app = FaceAnalysis(name=<span class="hljs-string">"buffalo_l"</span>, providers=[<span class="hljs-string">'CUDAExecutionProvider'</span>, <span class="hljs-string">'CPUExecutionProvider'</span>])
	app.prepare(ctx_id=<span class="hljs-number">0</span>, det_size=(<span class="hljs-number">640</span>, <span class="hljs-number">640</span>))
	image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
	faces = app.get(image)
	ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[<span class="hljs-number">0</span>].kps, image_size=<span class="hljs-number">224</span>))
	image = torch.from_numpy(faces[<span class="hljs-number">0</span>].normed_embedding)
	ref_images_embeds.append(image.unsqueeze(<span class="hljs-number">0</span>))
	ref_images_embeds = torch.stack(ref_images_embeds, dim=<span class="hljs-number">0</span>).unsqueeze(<span class="hljs-number">0</span>)
	neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
	id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device=<span class="hljs-string">"cuda"</span>)

	clip_embeds = pipeline.prepare_ip_adapter_image_embeds(
	[ip_adapter_images], <span class="hljs-literal">None</span>, torch.device(<span class="hljs-string">"cuda"</span>), num_images, <span class="hljs-literal">True</span>)[<span class="hljs-number">0</span>]

	pipeline.unet.encoder_hid_proj.image_projection_layers[<span class="hljs-number">0</span>].clip_embeds = clip_embeds.to(dtype=torch.float16)
	pipeline.unet.encoder_hid_proj.image_projection_layers[<span class="hljs-number">0</span>].shortcut = <span class="hljs-literal">False</span> <span class="hljs-comment"># True if Plus v2</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="multi-ip-adapter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-ip-adapter"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multi IP-Adapter</span></h3> <p data-svelte-h="svelte-1tk2m7v">More than one IP-Adapter can be used at the same time to generate specific images in more diverse styles. For example, you can use IP-Adapter-Face to generate consistent faces and characters, and IP-Adapter Plus to generate those faces in a specific style.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-apa7ha">Read the <a href="../using-diffusers/loading_adapters#ip-adapter-plus">IP-Adapter Plus</a> section to learn why you need to manually load the image encoder.</p></div> <p data-svelte-h="svelte-qgg0iy">Load the image encoder with <a href="https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPVisionModelWithProjection" rel="nofollow">CLIPVisionModelWithProjection</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image, DDIMScheduler
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> CLIPVisionModelWithProjection
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image

	image_encoder = CLIPVisionModelWithProjection.from_pretrained(
	<span class="hljs-string">"h94/IP-Adapter"</span>,
	subfolder=<span class="hljs-string">"models/image_encoder"</span>,
	torch_dtype=torch.float16,
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vw2kqb">Next, you’ll load a base model, scheduler, and the IP-Adapters. The IP-Adapters to use are passed as a list to the <code>weight_name</code> parameter:</p> <ul data-svelte-h="svelte-tpz2u9"><li><a href="https://huggingface.co/h94/IP-Adapter#ip-adapter-for-sdxl-10" rel="nofollow">ip-adapter-plus_sdxl_vit-h</a> uses patch embeddings and a ViT-H image encoder</li> <li><a href="https://huggingface.co/h94/IP-Adapter#ip-adapter-for-sdxl-10" rel="nofollow">ip-adapter-plus-face_sdxl_vit-h</a> has the same architecture but it is conditioned with images of cropped faces</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline = AutoPipelineForText2Image.from_pretrained(
	<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>,
	torch_dtype=torch.float16,
	image_encoder=image_encoder,
	)
	pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
	pipeline.load_ip_adapter(
	<span class="hljs-string">"h94/IP-Adapter"</span>,
	subfolder=<span class="hljs-string">"sdxl_models"</span>,
	weight_name=[<span class="hljs-string">"ip-adapter-plus_sdxl_vit-h.safetensors"</span>, <span class="hljs-string">"ip-adapter-plus-face_sdxl_vit-h.safetensors"</span>]
	)
	pipeline.set_ip_adapter_scale([<span class="hljs-number">0.7</span>, <span class="hljs-number">0.3</span>])
	pipeline.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1p6lwr0">Load an image prompt and a folder containing images of a certain style you want to use.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->face_image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png"</span>)
	style_folder = <span class="hljs-string">"https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy"</span>
	style_images = [load_image(<span class="hljs-string">f"<span class="hljs-subst">{style_folder}</span>/img<span class="hljs-subst">{i}</span>.png"</span>) <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">10</span>)]<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-im3du6"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/women_input.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image of face</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_style_grid.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter style images</figcaption></div></div> <p data-svelte-h="svelte-1kkzavx">Pass the image prompt and style images as a list to the <code>ip_adapter_image</code> parameter, and run the pipeline!</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)

	image = pipeline(
	prompt=<span class="hljs-string">"wonderwoman"</span>,
	ip_adapter_image=[style_images, face_image],
	negative_prompt=<span class="hljs-string">"monochrome, lowres, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">50</span>, num_images_per_prompt=<span class="hljs-number">1</span>,
	generator=generator,
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-1t1eqqe"> <img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_multi_out.png"></div> <h3 class="relative group"><a id="instant-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#instant-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Instant generation</span></h3> <p data-svelte-h="svelte-1am74qu"><a href="../using-diffusers/inference_with_lcm_lora">Latent Consistency Models (LCM)</a> are diffusion models that can generate images in as little as 4 steps compared to other diffusion models like SDXL that typically require way more steps. This is why image generation with an LCM feels “instantaneous”. IP-Adapters can be plugged into an LCM-LoRA model to instantly generate images with an image prompt.</p> <p data-svelte-h="svelte-yf1xoc">The IP-Adapter weights need to be loaded first, then you can use <a href="/docs/diffusers/pr_10567/en/api/loaders/lora#diffusers.loaders.StableDiffusionLoraLoaderMixin.load_lora_weights">load_lora_weights()</a> to load the LoRA style and weight you want to apply to your image.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline, LCMScheduler
	<span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image

	model_id = <span class="hljs-string">"sd-dreambooth-library/herge-style"</span>
	lcm_lora_id = <span class="hljs-string">"latent-consistency/lcm-lora-sdv1-5"</span>

	pipeline = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)

	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"models"</span>, weight_name=<span class="hljs-string">"ip-adapter_sd15.bin"</span>)
	pipeline.load_lora_weights(lcm_lora_id)
	pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
	pipeline.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-13u2z8i">Try using with a lower IP-Adapter scale to condition image generation more on the <a href="https://huggingface.co/sd-dreambooth-library/herge-style" rel="nofollow">herge_style</a> checkpoint, and remember to use the special token <code>herge_style</code> in your prompt to trigger and apply the style.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline.set_ip_adapter_scale(<span class="hljs-number">0.4</span>)

	prompt = <span class="hljs-string">"herge_style woman in armor, best quality, high quality"</span>
	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>)

	ip_adapter_image = load_image(<span class="hljs-string">"https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png"</span>)
	image = pipeline(
	prompt=prompt,
	ip_adapter_image=ip_adapter_image,
	num_inference_steps=<span class="hljs-number">4</span>,
	guidance_scale=<span class="hljs-number">1</span>,
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-1cw2b12"> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_herge.png"></div> <h3 class="relative group"><a id="structural-control" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#structural-control"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Structural control</span></h3> <p data-svelte-h="svelte-sz8jif">To control image generation to an even greater degree, you can combine IP-Adapter with a model like <a href="../using-diffusers/controlnet">ControlNet</a>. A ControlNet is also an adapter that can be inserted into a diffusion model to allow for conditioning on an additional control image. The control image can be depth maps, edge maps, pose estimations, and more.</p> <p data-svelte-h="svelte-b52ker">Load a <a href="/docs/diffusers/pr_10567/en/api/models/controlnet#diffusers.ControlNetModel">ControlNetModel</a> checkpoint conditioned on depth maps, insert it into a diffusion model, and load the IP-Adapter.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetPipeline, ControlNetModel
	<span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image

	controlnet_model_path = <span class="hljs-string">"lllyasviel/control_v11f1p_sd15_depth"</span>
	controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)

	pipeline = StableDiffusionControlNetPipeline.from_pretrained(
	<span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, controlnet=controlnet, torch_dtype=torch.float16)
	pipeline.to(<span class="hljs-string">"cuda"</span>)
	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"models"</span>, weight_name=<span class="hljs-string">"ip-adapter_sd15.bin"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-77tfin">Now load the IP-Adapter image and depth map.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->ip_adapter_image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png"</span>)
	depth_map = load_image(<span class="hljs-string">"https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png"</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-mjuwpp"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/statue.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/depth.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">depth map</figcaption></div></div> <p data-svelte-h="svelte-12fzw16">Pass the depth map and IP-Adapter image to the pipeline to generate an image.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">33</span>)
	image = pipeline(
	prompt=<span class="hljs-string">"best quality, high quality"</span>,
	image=depth_map,
	ip_adapter_image=ip_adapter_image,
	negative_prompt=<span class="hljs-string">"monochrome, lowres, bad anatomy, worst quality, low quality"</span>,
	num_inference_steps=<span class="hljs-number">50</span>,
	generator=generator,
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-kc4794"> <img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png"></div> <h3 class="relative group"><a id="style--layout-control" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#style--layout-control"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Style & layout control</span></h3> <p data-svelte-h="svelte-q7o983"><a href="https://arxiv.org/abs/2404.02733" rel="nofollow">InstantStyle</a> is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This way, you can generate images following only the style or layout from image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapters to specific parts of the model.</p> <p data-svelte-h="svelte-9mtd92">By default IP-Adapters are inserted to all layers of the model. Use the <a href="/docs/diffusers/pr_10567/en/api/loaders/ip_adapter#diffusers.loaders.IPAdapterMixin.set_ip_adapter_scale">set_ip_adapter_scale()</a> method with a dictionary to assign scales to IP-Adapter at different layers.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoPipelineForText2Image
	<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image
	<span class="hljs-keyword">import</span> torch

	pipeline = AutoPipelineForText2Image.from_pretrained(<span class="hljs-string">"stabilityai/stable-diffusion-xl-base-1.0"</span>, torch_dtype=torch.float16).to(<span class="hljs-string">"cuda"</span>)
	pipeline.load_ip_adapter(<span class="hljs-string">"h94/IP-Adapter"</span>, subfolder=<span class="hljs-string">"sdxl_models"</span>, weight_name=<span class="hljs-string">"ip-adapter_sdxl.bin"</span>)

	scale = {
	<span class="hljs-string">"down"</span>: {<span class="hljs-string">"block_2"</span>: [<span class="hljs-number">0.0</span>, <span class="hljs-number">1.0</span>]},
	<span class="hljs-string">"up"</span>: {<span class="hljs-string">"block_0"</span>: [<span class="hljs-number">0.0</span>, <span class="hljs-number">1.0</span>, <span class="hljs-number">0.0</span>]},
	}
	pipeline.set_ip_adapter_scale(scale)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kr0aol">This will activate IP-Adapter at the second layer in the model’s down-part block 2 and up-part block 0. The former is the layer where IP-Adapter injects layout information and the latter injects style. Inserting IP-Adapter to these two layers you can generate images following both the style and layout from image prompt, but with contents more aligned to text prompt.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->style_image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"</span>)

	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">26</span>)
	image = pipeline(
	prompt=<span class="hljs-string">"a cat, masterpiece, best quality, high quality"</span>,
	ip_adapter_image=style_image,
	negative_prompt=<span class="hljs-string">"text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry"</span>,
	guidance_scale=<span class="hljs-number">5</span>,
	num_inference_steps=<span class="hljs-number">30</span>,
	generator=generator,
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-fn5cjg"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter image</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption></div></div> <p data-svelte-h="svelte-12ccihh">In contrast, inserting IP-Adapter to all layers will often generate images that overly focus on image prompt and diminish diversity.</p> <p data-svelte-h="svelte-18cri0h">Activate IP-Adapter only in the style layer and then call the pipeline again.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->scale = {
	<span class="hljs-string">"up"</span>: {<span class="hljs-string">"block_0"</span>: [<span class="hljs-number">0.0</span>, <span class="hljs-number">1.0</span>, <span class="hljs-number">0.0</span>]},
	}
	pipeline.set_ip_adapter_scale(scale)

	generator = torch.Generator(device=<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">26</span>)
	image = pipeline(
	prompt=<span class="hljs-string">"a cat, masterpiece, best quality, high quality"</span>,
	ip_adapter_image=style_image,
	negative_prompt=<span class="hljs-string">"text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry"</span>,
	guidance_scale=<span class="hljs-number">5</span>,
	num_inference_steps=<span class="hljs-number">30</span>,
	generator=generator,
	).images[<span class="hljs-number">0</span>]
	image<!-- HTML_TAG_END --></pre></div> <div class="flex flex-row gap-4" data-svelte-h="svelte-1x2k8r5"><div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_only.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter only in style layer</figcaption></div> <div class="flex-1"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_ip_adapter.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">IP-Adapter in all layers</figcaption></div></div> <p data-svelte-h="svelte-bmv4p">Note that you don’t have to specify all layers in the dictionary. Those not included in the dictionary will be set to scale 0 which means disable IP-Adapter by default.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/using-diffusers/ip_adapter.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_inhvqu = {
	assets: "/docs/diffusers/pr_10567/en",
	base: "/docs/diffusers/pr_10567/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/diffusers/pr_10567/en/_app/immutable/entry/start.5ab964f0.js"),
	import("/docs/diffusers/pr_10567/en/_app/immutable/entry/app.d83dbfce.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 247],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 87.1 kB
Xet hash:: c2597be12e5cecdd39281460ad887ee45bd0b7c8fb8900416b74fad213dfc70b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.