Buckets:

download
raw
77.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;ControlNet&quot;,&quot;local&quot;:&quot;controlnet&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Text-to-image&quot;,&quot;local&quot;:&quot;text-to-image&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image-to-image&quot;,&quot;local&quot;:&quot;image-to-image&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Inpainting&quot;,&quot;local&quot;:&quot;inpainting&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Guess mode&quot;,&quot;local&quot;:&quot;guess-mode&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ControlNet with Stable Diffusion XL&quot;,&quot;local&quot;:&quot;controlnet-with-stable-diffusion-xl&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;MultiControlNet&quot;,&quot;local&quot;:&quot;multicontrolnet&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/diffusers/pr_10567/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/entry/start.5ab964f0.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/scheduler.8c3d61f6.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/singletons.1271a703.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/index.0997d446.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/paths.af967ee5.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/entry/app.d83dbfce.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/index.da70eac4.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/nodes/0.bb4a0671.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/nodes/237.6ffa6d8e.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/Tip.1d9b8c37.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/CodeBlock.00a903b3.js">
<link rel="modulepreload" href="/docs/diffusers/pr_10567/en/_app/immutable/chunks/EditOnGithub.1e64e623.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;ControlNet&quot;,&quot;local&quot;:&quot;controlnet&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Text-to-image&quot;,&quot;local&quot;:&quot;text-to-image&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image-to-image&quot;,&quot;local&quot;:&quot;image-to-image&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Inpainting&quot;,&quot;local&quot;:&quot;inpainting&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Guess mode&quot;,&quot;local&quot;:&quot;guess-mode&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ControlNet with Stable Diffusion XL&quot;,&quot;local&quot;:&quot;controlnet-with-stable-diffusion-xl&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;MultiControlNet&quot;,&quot;local&quot;:&quot;multicontrolnet&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="controlnet" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controlnet"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ControlNet</span></h1> <p data-svelte-h="svelte-1ersfq7">ControlNet is a type of model for controlling image diffusion models by conditioning the model with an additional input image. There are many types of conditioning inputs (canny edge, user sketching, human pose, depth, and more) you can use to control a diffusion model. This is hugely useful because it affords you greater control over image generation, making it easier to generate specific images without experimenting with different text prompts or denoising values as much.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1scaly9">Check out Section 3.5 of the <a href="https://huggingface.co/papers/2302.05543" rel="nofollow">ControlNet</a> paper v1 for a list of ControlNet implementations on various conditioning inputs. You can find the official Stable Diffusion ControlNet conditioned models on <a href="https://huggingface.co/lllyasviel" rel="nofollow">lllyasviel</a>’s Hub profile, and more <a href="https://huggingface.co/models?other=stable-diffusion&other=controlnet" rel="nofollow">community-trained</a> ones on the Hub.</p> <p data-svelte-h="svelte-2pe81s">For Stable Diffusion XL (SDXL) ControlNet models, you can find them on the 🤗 <a href="https://huggingface.co/diffusers" rel="nofollow">Diffusers</a> Hub organization, or you can browse <a href="https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet" rel="nofollow">community-trained</a> ones on the Hub.</p></div> <p data-svelte-h="svelte-r7ehkq">A ControlNet model has two sets of weights (or blocks) connected by a zero-convolution layer:</p> <ul data-svelte-h="svelte-1jzb95l"><li>a <em>locked copy</em> keeps everything a large pretrained diffusion model has learned</li> <li>a <em>trainable copy</em> is trained on the additional conditioning input</li></ul> <p data-svelte-h="svelte-10jstxp">Since the locked copy preserves the pretrained model, training and implementing a ControlNet on a new conditioning input is as fast as finetuning any other model because you aren’t training the model from scratch.</p> <p data-svelte-h="svelte-1rkzg2g">This guide will show you how to use ControlNet for text-to-image, image-to-image, inpainting, and more! There are many types of ControlNet conditioning inputs to choose from, but in this guide we’ll only focus on several of them. Feel free to experiment with other conditioning inputs!</p> <p data-svelte-h="svelte-cwruts">Before you begin, make sure you have the following libraries installed:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># uncomment to install the necessary libraries in Colab</span>
<span class="hljs-comment">#!pip install -q diffusers transformers accelerate opencv-python</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="text-to-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-to-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text-to-image</span></h2> <p data-svelte-h="svelte-1cb54q2">For text-to-image, you normally pass a text prompt to the model. But with ControlNet, you can specify an additional conditioning input. Let’s condition the model with a canny image, a white outline of an image on a black background. This way, the ControlNet can use the canny image as a control to guide the model to generate an image with the same outline.</p> <p data-svelte-h="svelte-9znxu8">Load an image and use the <a href="https://github.com/opencv/opencv-python" rel="nofollow">opencv-python</a> library to extract the canny image:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
<span class="hljs-keyword">import</span> cv2
<span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
original_image = load_image(
<span class="hljs-string">&quot;https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png&quot;</span>
)
image = np.array(original_image)
low_threshold = <span class="hljs-number">100</span>
high_threshold = <span class="hljs-number">200</span>
image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, <span class="hljs-literal">None</span>]
image = np.concatenate([image, image, image], axis=<span class="hljs-number">2</span>)
canny_image = Image.fromarray(image)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1jr9xr4"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption></div></div> <p data-svelte-h="svelte-1w5hdl6">Next, load a ControlNet model conditioned on canny edge detection and pass it to the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline">StableDiffusionControlNetPipeline</a>. Use the faster <a href="/docs/diffusers/pr_10567/en/api/schedulers/unipc#diffusers.UniPCMultistepScheduler">UniPCMultistepScheduler</a> and enable model offloading to speed up inference and reduce memory usage.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
<span class="hljs-keyword">import</span> torch
controlnet = ControlNetModel.from_pretrained(<span class="hljs-string">&quot;lllyasviel/sd-controlnet-canny&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
<span class="hljs-string">&quot;stable-diffusion-v1-5/stable-diffusion-v1-5&quot;</span>, controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17x9oax">Now pass your prompt and canny image to the pipeline:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->output = pipe(
<span class="hljs-string">&quot;the mona lisa&quot;</span>, image=canny_image
).images[<span class="hljs-number">0</span>]
make_image_grid([original_image, canny_image, output], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-3r1z1k"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-text2img.png"></div> <h2 class="relative group"><a id="image-to-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-to-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image-to-image</span></h2> <p data-svelte-h="svelte-g01jh2">For image-to-image, you’d typically pass an initial image and a prompt to the pipeline to generate a new image. With ControlNet, you can pass an additional conditioning input to guide the model. Let’s condition the model with a depth map, an image which contains spatial information. This way, the ControlNet can use the depth map as a control to guide the model to generate an image that preserves spatial information.</p> <p data-svelte-h="svelte-nfnsvw">You’ll use the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetImg2ImgPipeline">StableDiffusionControlNetImg2ImgPipeline</a> for this task, which is different from the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline">StableDiffusionControlNetPipeline</a> because it allows you to pass an initial image as the starting point for the image generation process.</p> <p data-svelte-h="svelte-9kkqfq">Load an image and use the <code>depth-estimation</code> <a href="https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.Pipeline" rel="nofollow">Pipeline</a> from 🤗 Transformers to extract the depth map of an image:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg&quot;</span>
)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_depth_map</span>(<span class="hljs-params">image, depth_estimator</span>):
image = depth_estimator(image)[<span class="hljs-string">&quot;depth&quot;</span>]
image = np.array(image)
image = image[:, :, <span class="hljs-literal">None</span>]
image = np.concatenate([image, image, image], axis=<span class="hljs-number">2</span>)
detected_map = torch.from_numpy(image).<span class="hljs-built_in">float</span>() / <span class="hljs-number">255.0</span>
depth_map = detected_map.permute(<span class="hljs-number">2</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>)
<span class="hljs-keyword">return</span> depth_map
depth_estimator = pipeline(<span class="hljs-string">&quot;depth-estimation&quot;</span>)
depth_map = get_depth_map(image, depth_estimator).unsqueeze(<span class="hljs-number">0</span>).half().to(<span class="hljs-string">&quot;cuda&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hmw649">Next, load a ControlNet model conditioned on depth maps and pass it to the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetImg2ImgPipeline">StableDiffusionControlNetImg2ImgPipeline</a>. Use the faster <a href="/docs/diffusers/pr_10567/en/api/schedulers/unipc#diffusers.UniPCMultistepScheduler">UniPCMultistepScheduler</a> and enable model offloading to speed up inference and reduce memory usage.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
<span class="hljs-keyword">import</span> torch
controlnet = ControlNetModel.from_pretrained(<span class="hljs-string">&quot;lllyasviel/control_v11f1p_sd15_depth&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
<span class="hljs-string">&quot;stable-diffusion-v1-5/stable-diffusion-v1-5&quot;</span>, controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wyr53z">Now pass your prompt, initial image, and depth map to the pipeline:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->output = pipe(
<span class="hljs-string">&quot;lego batman and robin&quot;</span>, image=image, control_image=depth_map,
).images[<span class="hljs-number">0</span>]
make_image_grid([image, output], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-8qhhvs"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"> <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img-2.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption></div></div> <h2 class="relative group"><a id="inpainting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inpainting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inpainting</span></h2> <p data-svelte-h="svelte-1i60bwf">For inpainting, you need an initial image, a mask image, and a prompt describing what to replace the mask with. ControlNet models allow you to add another control image to condition a model with. Let’s condition the model with an inpainting mask. This way, the ControlNet can use the inpainting mask as a control to guide the model to generate an image within the mask area.</p> <p data-svelte-h="svelte-175mm0t">Load an initial image and a mask image:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
init_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg&quot;</span>
)
init_image = init_image.resize((<span class="hljs-number">512</span>, <span class="hljs-number">512</span>))
mask_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg&quot;</span>
)
mask_image = mask_image.resize((<span class="hljs-number">512</span>, <span class="hljs-number">512</span>))
make_image_grid([init_image, mask_image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4efl8b">Create a function to prepare the control image from the initial and mask images. This’ll create a tensor to mark the pixels in <code>init_image</code> as masked if the corresponding pixel in <code>mask_image</code> is over a certain threshold.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">def</span> <span class="hljs-title function_">make_inpaint_condition</span>(<span class="hljs-params">image, image_mask</span>):
image = np.array(image.convert(<span class="hljs-string">&quot;RGB&quot;</span>)).astype(np.float32) / <span class="hljs-number">255.0</span>
image_mask = np.array(image_mask.convert(<span class="hljs-string">&quot;L&quot;</span>)).astype(np.float32) / <span class="hljs-number">255.0</span>
<span class="hljs-keyword">assert</span> image.shape[<span class="hljs-number">0</span>:<span class="hljs-number">1</span>] == image_mask.shape[<span class="hljs-number">0</span>:<span class="hljs-number">1</span>]
image[image_mask &gt; <span class="hljs-number">0.5</span>] = -<span class="hljs-number">1.0</span> <span class="hljs-comment"># set as masked pixel</span>
image = np.expand_dims(image, <span class="hljs-number">0</span>).transpose(<span class="hljs-number">0</span>, <span class="hljs-number">3</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>)
image = torch.from_numpy(image)
<span class="hljs-keyword">return</span> image
control_image = make_inpaint_condition(init_image, mask_image)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-grxw51"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"> <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"> <figcaption class="mt-2 text-center text-sm text-gray-500">mask image</figcaption></div></div> <p data-svelte-h="svelte-1dz4ve7">Load a ControlNet model conditioned on inpainting and pass it to the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetInpaintPipeline">StableDiffusionControlNetInpaintPipeline</a>. Use the faster <a href="/docs/diffusers/pr_10567/en/api/schedulers/unipc#diffusers.UniPCMultistepScheduler">UniPCMultistepScheduler</a> and enable model offloading to speed up inference and reduce memory usage.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
controlnet = ControlNetModel.from_pretrained(<span class="hljs-string">&quot;lllyasviel/control_v11p_sd15_inpaint&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
<span class="hljs-string">&quot;stable-diffusion-v1-5/stable-diffusion-v1-5&quot;</span>, controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-118t2qr">Now pass your prompt, initial image, mask image, and control image to the pipeline:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->output = pipe(
<span class="hljs-string">&quot;corgi face with large ears, detailed, pixar, animated, disney&quot;</span>,
num_inference_steps=<span class="hljs-number">20</span>,
eta=<span class="hljs-number">1.0</span>,
image=init_image,
mask_image=mask_image,
control_image=control_image,
).images[<span class="hljs-number">0</span>]
make_image_grid([init_image, mask_image, output], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-1qk6bz9"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-result.png"></div> <h2 class="relative group"><a id="guess-mode" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#guess-mode"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Guess mode</span></h2> <p data-svelte-h="svelte-8h99y7"><a href="https://github.com/lllyasviel/ControlNet/discussions/188" rel="nofollow">Guess mode</a> does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do its best to “guess” the contents of the input control map (depth map, pose estimation, canny edge, etc.).</p> <p data-svelte-h="svelte-vnjl1j">Guess mode adjusts the scale of the output residuals from a ControlNet by a fixed ratio depending on the block depth. The shallowest <code>DownBlock</code> corresponds to 0.1, and as the blocks get deeper, the scale increases exponentially such that the scale of the <code>MidBlock</code> output becomes 1.0.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-14ez4w">Guess mode does not have any impact on prompt conditioning and you can still provide a prompt if you want.</p></div> <p data-svelte-h="svelte-vqtz65">Set <code>guess_mode=True</code> in the pipeline, and it is <a href="https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode" rel="nofollow">recommended</a> to set the <code>guidance_scale</code> value between 3.0 and 5.0.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetPipeline, ControlNetModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
<span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
<span class="hljs-keyword">import</span> cv2
controlnet = ControlNetModel.from_pretrained(<span class="hljs-string">&quot;lllyasviel/sd-controlnet-canny&quot;</span>, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionControlNetPipeline.from_pretrained(<span class="hljs-string">&quot;stable-diffusion-v1-5/stable-diffusion-v1-5&quot;</span>, controlnet=controlnet, use_safetensors=<span class="hljs-literal">True</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
original_image = load_image(<span class="hljs-string">&quot;https://huggingface.co/takuma104/controlnet_dev/resolve/main/bird_512x512.png&quot;</span>)
image = np.array(original_image)
low_threshold = <span class="hljs-number">100</span>
high_threshold = <span class="hljs-number">200</span>
image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, <span class="hljs-literal">None</span>]
image = np.concatenate([image, image, image], axis=<span class="hljs-number">2</span>)
canny_image = Image.fromarray(image)
image = pipe(<span class="hljs-string">&quot;&quot;</span>, image=canny_image, guess_mode=<span class="hljs-literal">True</span>, guidance_scale=<span class="hljs-number">3.0</span>).images[<span class="hljs-number">0</span>]
make_image_grid([original_image, canny_image, image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1hf21gv"><div><img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">regular mode with prompt</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0_gm.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">guess mode without prompt</figcaption></div></div> <h2 class="relative group"><a id="controlnet-with-stable-diffusion-xl" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controlnet-with-stable-diffusion-xl"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ControlNet with Stable Diffusion XL</span></h2> <p data-svelte-h="svelte-1jgtfzx">There aren’t too many ControlNet models compatible with Stable Diffusion XL (SDXL) at the moment, but we’ve trained two full-sized ControlNet models for SDXL conditioned on canny edge detection and depth maps. We’re also experimenting with creating smaller versions of these SDXL-compatible ControlNet models so it is easier to run on resource-constrained hardware. You can find these checkpoints on the <a href="https://huggingface.co/diffusers" rel="nofollow">🤗 Diffusers Hub organization</a>!</p> <p data-svelte-h="svelte-7i5g8g">Let’s use a SDXL ControlNet conditioned on canny images to generate an image. Start by loading an image and prepare the canny image:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
<span class="hljs-keyword">import</span> cv2
<span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> torch
original_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png&quot;</span>
)
image = np.array(original_image)
low_threshold = <span class="hljs-number">100</span>
high_threshold = <span class="hljs-number">200</span>
image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, <span class="hljs-literal">None</span>]
image = np.concatenate([image, image, image], axis=<span class="hljs-number">2</span>)
canny_image = Image.fromarray(image)
make_image_grid([original_image, canny_image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-j8n5na"><div><img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hf-logo-canny.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption></div></div> <p data-svelte-h="svelte-77fcag">Load a SDXL ControlNet model conditioned on canny edge detection and pass it to the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet_sdxl#diffusers.StableDiffusionXLControlNetPipeline">StableDiffusionXLControlNetPipeline</a>. You can also enable model offloading to reduce memory usage.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->controlnet = ControlNetModel.from_pretrained(
<span class="hljs-string">&quot;diffusers/controlnet-canny-sdxl-1.0&quot;</span>,
torch_dtype=torch.float16,
use_safetensors=<span class="hljs-literal">True</span>
)
vae = AutoencoderKL.from_pretrained(<span class="hljs-string">&quot;madebyollin/sdxl-vae-fp16-fix&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>,
controlnet=controlnet,
vae=vae,
torch_dtype=torch.float16,
use_safetensors=<span class="hljs-literal">True</span>
)
pipe.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-172pcgi">Now pass your prompt (and optionally a negative prompt if you’re using one) and canny image to the pipeline:</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-52h992">The <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale" rel="nofollow"><code>controlnet_conditioning_scale</code></a> parameter determines how much weight to assign to the conditioning inputs. A value of 0.5 is recommended for good generalization, but feel free to experiment with this number!</p></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = <span class="hljs-string">&quot;aerial view, a futuristic research complex in a bright foggy jungle, hard lighting&quot;</span>
negative_prompt = <span class="hljs-string">&#x27;low quality, bad quality, sketches&#x27;</span>
image = pipe(
prompt,
negative_prompt=negative_prompt,
image=canny_image,
controlnet_conditioning_scale=<span class="hljs-number">0.5</span>,
).images[<span class="hljs-number">0</span>]
make_image_grid([original_image, canny_image, image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-8vwtkd"><img class="rounded-xl" src="https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0/resolve/main/out_hug_lab_7.png"></div> <p data-svelte-h="svelte-1ae36yk">You can use <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet_sdxl#diffusers.StableDiffusionXLControlNetPipeline">StableDiffusionXLControlNetPipeline</a> in guess mode as well by setting the parameter to <code>True</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
<span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> cv2
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
prompt = <span class="hljs-string">&quot;aerial view, a futuristic research complex in a bright foggy jungle, hard lighting&quot;</span>
negative_prompt = <span class="hljs-string">&quot;low quality, bad quality, sketches&quot;</span>
original_image = load_image(
<span class="hljs-string">&quot;https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png&quot;</span>
)
controlnet = ControlNetModel.from_pretrained(
<span class="hljs-string">&quot;diffusers/controlnet-canny-sdxl-1.0&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
)
vae = AutoencoderKL.from_pretrained(<span class="hljs-string">&quot;madebyollin/sdxl-vae-fp16-fix&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, controlnet=controlnet, vae=vae, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
)
pipe.enable_model_cpu_offload()
image = np.array(original_image)
image = cv2.Canny(image, <span class="hljs-number">100</span>, <span class="hljs-number">200</span>)
image = image[:, :, <span class="hljs-literal">None</span>]
image = np.concatenate([image, image, image], axis=<span class="hljs-number">2</span>)
canny_image = Image.fromarray(image)
image = pipe(
prompt, negative_prompt=negative_prompt, controlnet_conditioning_scale=<span class="hljs-number">0.5</span>, image=canny_image, guess_mode=<span class="hljs-literal">True</span>,
).images[<span class="hljs-number">0</span>]
make_image_grid([original_image, canny_image, image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-vlhoer">You can use a refiner model with <code>StableDiffusionXLControlNetPipeline</code> to improve image quality, just like you can with a regular <code>StableDiffusionXLPipeline</code>.
See the <a href="./sdxl#refine-image-quality">Refine image quality</a> section to learn how to use the refiner model.
Make sure to use <code>StableDiffusionXLControlNetPipeline</code> and pass <code>image</code> and <code>controlnet_conditioning_scale</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->base = StableDiffusionXLControlNetPipeline(...)
image = base(
prompt=prompt,
controlnet_conditioning_scale=<span class="hljs-number">0.5</span>,
image=canny_image,
num_inference_steps=<span class="hljs-number">40</span>,
denoising_end=<span class="hljs-number">0.8</span>,
output_type=<span class="hljs-string">&quot;latent&quot;</span>,
).images
<span class="hljs-comment"># rest exactly as with StableDiffusionXLPipeline</span><!-- HTML_TAG_END --></pre></div></div> <h2 class="relative group"><a id="multicontrolnet" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multicontrolnet"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>MultiControlNet</span></h2> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ci5dpp">Replace the SDXL model with a model like <a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5" rel="nofollow">stable-diffusion-v1-5/stable-diffusion-v1-5</a> to use multiple conditioning inputs with Stable Diffusion models.</p></div> <p data-svelte-h="svelte-1engdvd">You can compose multiple ControlNet conditionings from different image inputs to create a <em>MultiControlNet</em>. To get better results, it is often helpful to:</p> <ol data-svelte-h="svelte-18sf1v5"><li>mask conditionings such that they don’t overlap (for example, mask the area of a canny image where the pose conditioning is located)</li> <li>experiment with the <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale" rel="nofollow"><code>controlnet_conditioning_scale</code></a> parameter to determine how much weight to assign to each conditioning input</li></ol> <p data-svelte-h="svelte-1gpdoop">In this example, you’ll combine a canny image and a human pose estimation image to generate a new image.</p> <p data-svelte-h="svelte-1k4sivd">Prepare the canny image conditioning:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, make_image_grid
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
<span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-keyword">import</span> cv2
original_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png&quot;</span>
)
image = np.array(original_image)
low_threshold = <span class="hljs-number">100</span>
high_threshold = <span class="hljs-number">200</span>
image = cv2.Canny(image, low_threshold, high_threshold)
<span class="hljs-comment"># zero out middle columns of image where pose will be overlaid</span>
zero_start = image.shape[<span class="hljs-number">1</span>] // <span class="hljs-number">4</span>
zero_end = zero_start + image.shape[<span class="hljs-number">1</span>] // <span class="hljs-number">2</span>
image[:, zero_start:zero_end] = <span class="hljs-number">0</span>
image = image[:, :, <span class="hljs-literal">None</span>]
image = np.concatenate([image, image, image], axis=<span class="hljs-number">2</span>)
canny_image = Image.fromarray(image)
make_image_grid([original_image, canny_image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1cl87j5"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/landscape_canny_masked.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption></div></div> <p data-svelte-h="svelte-2v63o">For human pose estimation, install <a href="https://github.com/patrickvonplaten/controlnet_aux" rel="nofollow">controlnet_aux</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># uncomment to install the necessary library in Colab</span>
<span class="hljs-comment">#!pip install -q controlnet-aux</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bab8ey">Prepare the human pose estimation conditioning:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> controlnet_aux <span class="hljs-keyword">import</span> OpenposeDetector
openpose = OpenposeDetector.from_pretrained(<span class="hljs-string">&quot;lllyasviel/ControlNet&quot;</span>)
original_image = load_image(
<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png&quot;</span>
)
openpose_image = openpose(original_image)
make_image_grid([original_image, openpose_image], rows=<span class="hljs-number">1</span>, cols=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1og8gmm"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/person_pose.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">human pose image</figcaption></div></div> <p data-svelte-h="svelte-ehh59t">Load a list of ControlNet models that correspond to each conditioning, and pass them to the <a href="/docs/diffusers/pr_10567/en/api/pipelines/controlnet_sdxl#diffusers.StableDiffusionXLControlNetPipeline">StableDiffusionXLControlNetPipeline</a>. Use the faster <a href="/docs/diffusers/pr_10567/en/api/schedulers/unipc#diffusers.UniPCMultistepScheduler">UniPCMultistepScheduler</a> and enable model offloading to reduce memory usage.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL, UniPCMultistepScheduler
<span class="hljs-keyword">import</span> torch
controlnets = [
ControlNetModel.from_pretrained(
<span class="hljs-string">&quot;thibaud/controlnet-openpose-sdxl-1.0&quot;</span>, torch_dtype=torch.float16
),
ControlNetModel.from_pretrained(
<span class="hljs-string">&quot;diffusers/controlnet-canny-sdxl-1.0&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
),
]
vae = AutoencoderKL.from_pretrained(<span class="hljs-string">&quot;madebyollin/sdxl-vae-fp16-fix&quot;</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-diffusion-xl-base-1.0&quot;</span>, controlnet=controlnets, vae=vae, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-d5aph9">Now you can pass your prompt (an optional negative prompt if you’re using one), canny image, and pose image to the pipeline:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = <span class="hljs-string">&quot;a giant standing in a fantasy landscape, best quality&quot;</span>
negative_prompt = <span class="hljs-string">&quot;monochrome, lowres, bad anatomy, worst quality, low quality&quot;</span>
generator = torch.manual_seed(<span class="hljs-number">1</span>)
images = [openpose_image.resize((<span class="hljs-number">1024</span>, <span class="hljs-number">1024</span>)), canny_image.resize((<span class="hljs-number">1024</span>, <span class="hljs-number">1024</span>))]
images = pipe(
prompt,
image=images,
num_inference_steps=<span class="hljs-number">25</span>,
generator=generator,
negative_prompt=negative_prompt,
num_images_per_prompt=<span class="hljs-number">3</span>,
controlnet_conditioning_scale=[<span class="hljs-number">1.0</span>, <span class="hljs-number">0.8</span>],
).images
make_image_grid([original_image, canny_image, openpose_image,
images[<span class="hljs-number">0</span>].resize((<span class="hljs-number">512</span>, <span class="hljs-number">512</span>)), images[<span class="hljs-number">1</span>].resize((<span class="hljs-number">512</span>, <span class="hljs-number">512</span>)), images[<span class="hljs-number">2</span>].resize((<span class="hljs-number">512</span>, <span class="hljs-number">512</span>))], rows=<span class="hljs-number">2</span>, cols=<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-1qlyggw"><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet.png"></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/using-diffusers/controlnet.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_inhvqu = {
assets: "/docs/diffusers/pr_10567/en",
base: "/docs/diffusers/pr_10567/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/diffusers/pr_10567/en/_app/immutable/entry/start.5ab964f0.js"),
import("/docs/diffusers/pr_10567/en/_app/immutable/entry/app.d83dbfce.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 237],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
77.8 kB
·
Xet hash:
587297e8edadd308ad99a212f76656da673db47d5c08845c3a7d2fe315e7d784

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.