Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / diffusers /pr_10083 /en /using-diffusers /write_own_pipeline.html

64.1 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Understanding pipelines, models and schedulers","local":"understanding-pipelines-models-and-schedulers","sections":[{"title":"Deconstruct a basic pipeline","local":"deconstruct-a-basic-pipeline","sections":[],"depth":2},{"title":"Deconstruct the Stable Diffusion pipeline","local":"deconstruct-the-stable-diffusion-pipeline","sections":[{"title":"Create text embeddings","local":"create-text-embeddings","sections":[],"depth":3},{"title":"Create random noise","local":"create-random-noise","sections":[],"depth":3},{"title":"Denoise the image","local":"denoise-the-image","sections":[],"depth":3},{"title":"Decode the image","local":"decode-the-image","sections":[],"depth":3}],"depth":2},{"title":"Next steps","local":"next-steps","sections":[],"depth":2}],"depth":1}">
	<link href="/docs/diffusers/pr_10083/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/entry/start.3ed1a0f4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/scheduler.8c3d61f6.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/singletons.da8f2a2c.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/index.0997d446.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/paths.e585b256.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/entry/app.f0e18a17.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/index.da70eac4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/nodes/0.0db17d3a.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/nodes/255.8a0b11d3.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/Tip.1d9b8c37.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/CodeBlock.00a903b3.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/DocNotebookDropdown.02900f6b.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/globals.7f7f1b26.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_10083/en/_app/immutable/chunks/EditOnGithub.1e64e623.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Understanding pipelines, models and schedulers","local":"understanding-pipelines-models-and-schedulers","sections":[{"title":"Deconstruct a basic pipeline","local":"deconstruct-a-basic-pipeline","sections":[],"depth":2},{"title":"Deconstruct the Stable Diffusion pipeline","local":"deconstruct-the-stable-diffusion-pipeline","sections":[{"title":"Create text embeddings","local":"create-text-embeddings","sections":[],"depth":3},{"title":"Create random noise","local":"create-random-noise","sections":[],"depth":3},{"title":"Denoise the image","local":"denoise-the-image","sections":[],"depth":3},{"title":"Decode the image","local":"decode-the-image","sections":[],"depth":3}],"depth":2},{"title":"Next steps","local":"next-steps","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="understanding-pipelines-models-and-schedulers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-pipelines-models-and-schedulers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding pipelines, models and schedulers</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-1wr97qy">🧨 Diffusers is designed to be a user-friendly and flexible toolbox for building diffusion systems tailored to your use-case. At the core of the toolbox are models and schedulers. While the <a href="/docs/diffusers/pr_10083/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a> bundles these components together for convenience, you can also unbundle the pipeline and use the models and schedulers separately to create new diffusion systems.</p> <p data-svelte-h="svelte-5khcjg">In this tutorial, you’ll learn how to use models and schedulers to assemble a diffusion system for inference, starting with a basic pipeline and then progressing to the Stable Diffusion pipeline.</p> <h2 class="relative group"><a id="deconstruct-a-basic-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deconstruct-a-basic-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deconstruct a basic pipeline</span></h2> <p data-svelte-h="svelte-1o4pmop">A pipeline is a quick and easy way to run a model for inference, requiring no more than four lines of code to generate an image:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DDPMPipeline

	<span class="hljs-meta">>>> </span>ddpm = DDPMPipeline.from_pretrained(<span class="hljs-string">"google/ddpm-cat-256"</span>, use_safetensors=<span class="hljs-literal">True</span>).to(<span class="hljs-string">"cuda"</span>)
	<span class="hljs-meta">>>> </span>image = ddpm(num_inference_steps=<span class="hljs-number">25</span>).images[<span class="hljs-number">0</span>]
	<span class="hljs-meta">>>> </span>image<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-ej6f4c"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ddpm-cat.png" alt="Image of cat created from DDPMPipeline"></div> <p data-svelte-h="svelte-7b411q">That was super easy, but how did the pipeline do that? Let’s breakdown the pipeline and take a look at what’s happening under the hood.</p> <p data-svelte-h="svelte-1lbvhlp">In the example above, the pipeline contains a <a href="/docs/diffusers/pr_10083/en/api/models/unet2d#diffusers.UNet2DModel">UNet2DModel</a> model and a <a href="/docs/diffusers/pr_10083/en/api/schedulers/ddpm#diffusers.DDPMScheduler">DDPMScheduler</a>. The pipeline denoises an image by taking random noise the size of the desired output and passing it through the model several times. At each timestep, the model predicts the <em>noise residual</em> and the scheduler uses it to predict a less noisy image. The pipeline repeats this process until it reaches the end of the specified number of inference steps.</p> <p data-svelte-h="svelte-mwr50">To recreate the pipeline with the model and scheduler separately, let’s write our own denoising process.</p> <ol data-svelte-h="svelte-g7uo25"><li>Load the model and scheduler:</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DDPMScheduler, UNet2DModel

	<span class="hljs-meta">>>> </span>scheduler = DDPMScheduler.from_pretrained(<span class="hljs-string">"google/ddpm-cat-256"</span>)
	<span class="hljs-meta">>>> </span>model = UNet2DModel.from_pretrained(<span class="hljs-string">"google/ddpm-cat-256"</span>, use_safetensors=<span class="hljs-literal">True</span>).to(<span class="hljs-string">"cuda"</span>)<!-- HTML_TAG_END --></pre></div> <ol start="2" data-svelte-h="svelte-o7kbas"><li>Set the number of timesteps to run the denoising process for:</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>scheduler.set_timesteps(<span class="hljs-number">50</span>)<!-- HTML_TAG_END --></pre></div> <ol start="3" data-svelte-h="svelte-1jnsvh9"><li>Setting the scheduler timesteps creates a tensor with evenly spaced elements in it, 50 in this example. Each element corresponds to a timestep at which the model denoises an image. When you create the denoising loop later, you’ll iterate over this tensor to denoise an image:</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>scheduler.timesteps
	tensor([<span class="hljs-number">980</span>, <span class="hljs-number">960</span>, <span class="hljs-number">940</span>, <span class="hljs-number">920</span>, <span class="hljs-number">900</span>, <span class="hljs-number">880</span>, <span class="hljs-number">860</span>, <span class="hljs-number">840</span>, <span class="hljs-number">820</span>, <span class="hljs-number">800</span>, <span class="hljs-number">780</span>, <span class="hljs-number">760</span>, <span class="hljs-number">740</span>, <span class="hljs-number">720</span>,
	<span class="hljs-number">700</span>, <span class="hljs-number">680</span>, <span class="hljs-number">660</span>, <span class="hljs-number">640</span>, <span class="hljs-number">620</span>, <span class="hljs-number">600</span>, <span class="hljs-number">580</span>, <span class="hljs-number">560</span>, <span class="hljs-number">540</span>, <span class="hljs-number">520</span>, <span class="hljs-number">500</span>, <span class="hljs-number">480</span>, <span class="hljs-number">460</span>, <span class="hljs-number">440</span>,
	<span class="hljs-number">420</span>, <span class="hljs-number">400</span>, <span class="hljs-number">380</span>, <span class="hljs-number">360</span>, <span class="hljs-number">340</span>, <span class="hljs-number">320</span>, <span class="hljs-number">300</span>, <span class="hljs-number">280</span>, <span class="hljs-number">260</span>, <span class="hljs-number">240</span>, <span class="hljs-number">220</span>, <span class="hljs-number">200</span>, <span class="hljs-number">180</span>, <span class="hljs-number">160</span>,
	<span class="hljs-number">140</span>, <span class="hljs-number">120</span>, <span class="hljs-number">100</span>, <span class="hljs-number">80</span>, <span class="hljs-number">60</span>, <span class="hljs-number">40</span>, <span class="hljs-number">20</span>, <span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <ol start="4" data-svelte-h="svelte-1hrlquj"><li>Create some random noise with the same shape as the desired output:</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch

	<span class="hljs-meta">>>> </span>sample_size = model.config.sample_size
	<span class="hljs-meta">>>> </span>noise = torch.randn((<span class="hljs-number">1</span>, <span class="hljs-number">3</span>, sample_size, sample_size), device=<span class="hljs-string">"cuda"</span>)<!-- HTML_TAG_END --></pre></div> <ol start="5" data-svelte-h="svelte-1g6w4bl"><li>Now write a loop to iterate over the timesteps. At each timestep, the model does a <a href="/docs/diffusers/pr_10083/en/api/models/unet2d#diffusers.UNet2DModel.forward">UNet2DModel.forward()</a> pass and returns the noisy residual. The scheduler’s <a href="/docs/diffusers/pr_10083/en/api/schedulers/ddpm#diffusers.DDPMScheduler.step">step()</a> method takes the noisy residual, timestep, and input and it predicts the image at the previous timestep. This output becomes the next input to the model in the denoising loop, and it’ll repeat until it reaches the end of the <code>timesteps</code> array.</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-built_in">input</span> = noise

	<span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> t <span class="hljs-keyword">in</span> scheduler.timesteps:
	<span class="hljs-meta">... </span> <span class="hljs-keyword">with</span> torch.no_grad():
	<span class="hljs-meta">... </span> noisy_residual = model(<span class="hljs-built_in">input</span>, t).sample
	<span class="hljs-meta">... </span> previous_noisy_sample = scheduler.step(noisy_residual, t, <span class="hljs-built_in">input</span>).prev_sample
	<span class="hljs-meta">... </span> <span class="hljs-built_in">input</span> = previous_noisy_sample<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sdv3mq">This is the entire denoising process, and you can use this same pattern to write any diffusion system.</p> <ol start="6" data-svelte-h="svelte-1du7raq"><li>The last step is to convert the denoised output into an image:</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np

	<span class="hljs-meta">>>> </span>image = (<span class="hljs-built_in">input</span> / <span class="hljs-number">2</span> + <span class="hljs-number">0.5</span>).clamp(<span class="hljs-number">0</span>, <span class="hljs-number">1</span>).squeeze()
	<span class="hljs-meta">>>> </span>image = (image.permute(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">0</span>) * <span class="hljs-number">255</span>).<span class="hljs-built_in">round</span>().to(torch.uint8).cpu().numpy()
	<span class="hljs-meta">>>> </span>image = Image.fromarray(image)
	<span class="hljs-meta">>>> </span>image<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-t8ln8h">In the next section, you’ll put your skills to the test and breakdown the more complex Stable Diffusion pipeline. The steps are more or less the same. You’ll initialize the necessary components, and set the number of timesteps to create a <code>timestep</code> array. The <code>timestep</code> array is used in the denoising loop, and for each element in this array, the model predicts a less noisy image. The denoising loop iterates over the <code>timestep</code>’s, and at each timestep, it outputs a noisy residual and the scheduler uses it to predict a less noisy image at the previous timestep. This process is repeated until you reach the end of the <code>timestep</code> array.</p> <p data-svelte-h="svelte-tru974">Let’s try it out!</p> <h2 class="relative group"><a id="deconstruct-the-stable-diffusion-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deconstruct-the-stable-diffusion-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deconstruct the Stable Diffusion pipeline</span></h2> <p data-svelte-h="svelte-hpeh95">Stable Diffusion is a text-to-image <em>latent diffusion</em> model. It is called a latent diffusion model because it works with a lower-dimensional representation of the image instead of the actual pixel space, which makes it more memory efficient. The encoder compresses the image into a smaller representation, and a decoder to convert the compressed representation back into an image. For text-to-image models, you’ll need a tokenizer and an encoder to generate text embeddings. From the previous example, you already know you need a UNet model and a scheduler.</p> <p data-svelte-h="svelte-1yb6q1d">As you can see, this is already more complex than the DDPM pipeline which only contains a UNet model. The Stable Diffusion model has three separate pretrained models.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-14e2v6n">💡 Read the <a href="https://huggingface.co/blog/stable_diffusion#how-does-stable-diffusion-work" rel="nofollow">How does Stable Diffusion work?</a> blog for more details about how the VAE, UNet, and text encoder models work.</p></div> <p data-svelte-h="svelte-1h49h89">Now that you know what you need for the Stable Diffusion pipeline, load all these components with the <a href="/docs/diffusers/pr_10083/en/api/models/overview#diffusers.ModelMixin.from_pretrained">from_pretrained()</a> method. You can find them in the pretrained <a href="https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5" rel="nofollow"><code>stable-diffusion-v1-5/stable-diffusion-v1-5</code></a> checkpoint, and each component is stored in a separate subfolder:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> CLIPTextModel, CLIPTokenizer
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoencoderKL, UNet2DConditionModel, PNDMScheduler

	<span class="hljs-meta">>>> </span>vae = AutoencoderKL.from_pretrained(<span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>, subfolder=<span class="hljs-string">"vae"</span>, use_safetensors=<span class="hljs-literal">True</span>)
	<span class="hljs-meta">>>> </span>tokenizer = CLIPTokenizer.from_pretrained(<span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>, subfolder=<span class="hljs-string">"tokenizer"</span>)
	<span class="hljs-meta">>>> </span>text_encoder = CLIPTextModel.from_pretrained(
	<span class="hljs-meta">... </span> <span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>, subfolder=<span class="hljs-string">"text_encoder"</span>, use_safetensors=<span class="hljs-literal">True</span>
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>unet = UNet2DConditionModel.from_pretrained(
	<span class="hljs-meta">... </span> <span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>, subfolder=<span class="hljs-string">"unet"</span>, use_safetensors=<span class="hljs-literal">True</span>
	<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nfiouf">Instead of the default <a href="/docs/diffusers/pr_10083/en/api/schedulers/pndm#diffusers.PNDMScheduler">PNDMScheduler</a>, exchange it for the <a href="/docs/diffusers/pr_10083/en/api/schedulers/unipc#diffusers.UniPCMultistepScheduler">UniPCMultistepScheduler</a> to see how easy it is to plug a different scheduler in:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> UniPCMultistepScheduler

	<span class="hljs-meta">>>> </span>scheduler = UniPCMultistepScheduler.from_pretrained(<span class="hljs-string">"CompVis/stable-diffusion-v1-4"</span>, subfolder=<span class="hljs-string">"scheduler"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tr4efr">To speed up inference, move the models to a GPU since, unlike the scheduler, they have trainable weights:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>torch_device = <span class="hljs-string">"cuda"</span>
	<span class="hljs-meta">>>> </span>vae.to(torch_device)
	<span class="hljs-meta">>>> </span>text_encoder.to(torch_device)
	<span class="hljs-meta">>>> </span>unet.to(torch_device)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="create-text-embeddings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-text-embeddings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create text embeddings</span></h3> <p data-svelte-h="svelte-ltx5oa">The next step is to tokenize the text to generate embeddings. The text is used to condition the UNet model and steer the diffusion process towards something that resembles the input prompt.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-vswzt8">💡 The <code>guidance_scale</code> parameter determines how much weight should be given to the prompt when generating an image.</p></div> <p data-svelte-h="svelte-z6gwd8">Feel free to choose any prompt you like if you want to generate something else!</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>prompt = [<span class="hljs-string">"a photograph of an astronaut riding a horse"</span>]
	<span class="hljs-meta">>>> </span>height = <span class="hljs-number">512</span> <span class="hljs-comment"># default height of Stable Diffusion</span>
	<span class="hljs-meta">>>> </span>width = <span class="hljs-number">512</span> <span class="hljs-comment"># default width of Stable Diffusion</span>
	<span class="hljs-meta">>>> </span>num_inference_steps = <span class="hljs-number">25</span> <span class="hljs-comment"># Number of denoising steps</span>
	<span class="hljs-meta">>>> </span>guidance_scale = <span class="hljs-number">7.5</span> <span class="hljs-comment"># Scale for classifier-free guidance</span>
	<span class="hljs-meta">>>> </span>generator = torch.manual_seed(<span class="hljs-number">0</span>) <span class="hljs-comment"># Seed generator to create the initial latent noise</span>
	<span class="hljs-meta">>>> </span>batch_size = <span class="hljs-built_in">len</span>(prompt)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-q7skg3">Tokenize the text and generate the embeddings from the prompt:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>text_input = tokenizer(
	<span class="hljs-meta">... </span> prompt, padding=<span class="hljs-string">"max_length"</span>, max_length=tokenizer.model_max_length, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"pt"</span>
	<span class="hljs-meta">... </span>)

	<span class="hljs-meta">>>> </span><span class="hljs-keyword">with</span> torch.no_grad():
	<span class="hljs-meta">... </span> text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e64ab9">You’ll also need to generate the <em>unconditional text embeddings</em> which are the embeddings for the padding token. These need to have the same shape (<code>batch_size</code> and <code>seq_length</code>) as the conditional <code>text_embeddings</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>max_length = text_input.input_ids.shape[-<span class="hljs-number">1</span>]
	<span class="hljs-meta">>>> </span>uncond_input = tokenizer([<span class="hljs-string">""</span>] * batch_size, padding=<span class="hljs-string">"max_length"</span>, max_length=max_length, return_tensors=<span class="hljs-string">"pt"</span>)
	<span class="hljs-meta">>>> </span>uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-41mj23">Let’s concatenate the conditional and unconditional embeddings into a batch to avoid doing two forward passes:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>text_embeddings = torch.cat([uncond_embeddings, text_embeddings])<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="create-random-noise" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-random-noise"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create random noise</span></h3> <p data-svelte-h="svelte-1gyrog2">Next, generate some initial random noise as a starting point for the diffusion process. This is the latent representation of the image, and it’ll be gradually denoised. At this point, the <code>latent</code> image is smaller than the final image size but that’s okay though because the model will transform it into the final 512x512 image dimensions later.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1c9k596">💡 The height and width are divided by 8 because the <code>vae</code> model has 3 down-sampling layers. You can check by running the following:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">2</span> ** (<span class="hljs-built_in">len</span>(vae.config.block_out_channels) - <span class="hljs-number">1</span>) == <span class="hljs-number">8</span><!-- HTML_TAG_END --></pre></div></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>latents = torch.randn(
	<span class="hljs-meta">... </span> (batch_size, unet.config.in_channels, height // <span class="hljs-number">8</span>, width // <span class="hljs-number">8</span>),
	<span class="hljs-meta">... </span> generator=generator,
	<span class="hljs-meta">... </span> device=torch_device,
	<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="denoise-the-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#denoise-the-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Denoise the image</span></h3> <p data-svelte-h="svelte-1crc9ob">Start by scaling the input with the initial noise distribution, <em>sigma</em>, the noise scale value, which is required for improved schedulers like <a href="/docs/diffusers/pr_10083/en/api/schedulers/unipc#diffusers.UniPCMultistepScheduler">UniPCMultistepScheduler</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>latents = latents * scheduler.init_noise_sigma<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ul1wgx">The last step is to create the denoising loop that’ll progressively transform the pure noise in <code>latents</code> to an image described by your prompt. Remember, the denoising loop needs to do three things:</p> <ol data-svelte-h="svelte-ceuo6w"><li>Set the scheduler’s timesteps to use during denoising.</li> <li>Iterate over the timesteps.</li> <li>At each timestep, call the UNet model to predict the noise residual and pass it to the scheduler to compute the previous noisy sample.</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tqdm.auto <span class="hljs-keyword">import</span> tqdm

	<span class="hljs-meta">>>> </span>scheduler.set_timesteps(num_inference_steps)

	<span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> t <span class="hljs-keyword">in</span> tqdm(scheduler.timesteps):
	<span class="hljs-meta">... </span> <span class="hljs-comment"># expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.</span>
	<span class="hljs-meta">... </span> latent_model_input = torch.cat([latents] * <span class="hljs-number">2</span>)

	<span class="hljs-meta">... </span> latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)

	<span class="hljs-meta">... </span> <span class="hljs-comment"># predict the noise residual</span>
	<span class="hljs-meta">... </span> <span class="hljs-keyword">with</span> torch.no_grad():
	<span class="hljs-meta">... </span> noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

	<span class="hljs-meta">... </span> <span class="hljs-comment"># perform guidance</span>
	<span class="hljs-meta">... </span> noise_pred_uncond, noise_pred_text = noise_pred.chunk(<span class="hljs-number">2</span>)
	<span class="hljs-meta">... </span> noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

	<span class="hljs-meta">... </span> <span class="hljs-comment"># compute the previous noisy sample x_t -> x_t-1</span>
	<span class="hljs-meta">... </span> latents = scheduler.step(noise_pred, t, latents).prev_sample<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="decode-the-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#decode-the-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Decode the image</span></h3> <p data-svelte-h="svelte-rzriwa">The final step is to use the <code>vae</code> to decode the latent representation into an image and get the decoded output with <code>sample</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># scale and decode the image latents with vae</span>
	latents = <span class="hljs-number">1</span> / <span class="hljs-number">0.18215</span> * latents
	<span class="hljs-keyword">with</span> torch.no_grad():
	image = vae.decode(latents).sample<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6reqqn">Lastly, convert the image to a <code>PIL.Image</code> to see your generated image!</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>image = (image / <span class="hljs-number">2</span> + <span class="hljs-number">0.5</span>).clamp(<span class="hljs-number">0</span>, <span class="hljs-number">1</span>).squeeze()
	<span class="hljs-meta">>>> </span>image = (image.permute(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">0</span>) * <span class="hljs-number">255</span>).to(torch.uint8).cpu().numpy()
	<span class="hljs-meta">>>> </span>image = Image.fromarray(image)
	<span class="hljs-meta">>>> </span>image<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-1b0w6va"><img src="https://huggingface.co/blog/assets/98_stable_diffusion/stable_diffusion_k_lms.png"></div> <h2 class="relative group"><a id="next-steps" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#next-steps"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Next steps</span></h2> <p data-svelte-h="svelte-9a4wlz">From basic to complex pipelines, you’ve seen that all you really need to write your own diffusion system is a denoising loop. The loop should set the scheduler’s timesteps, iterate over them, and alternate between calling the UNet model to predict the noise residual and passing it to the scheduler to compute the previous noisy sample.</p> <p data-svelte-h="svelte-u3h7jy">This is really what 🧨 Diffusers is designed for: to make it intuitive and easy to write your own diffusion system using models and schedulers.</p> <p data-svelte-h="svelte-xe6tlx">For your next steps, feel free to:</p> <ul data-svelte-h="svelte-wmho1r"><li>Learn how to <a href="../using-diffusers/contribute_pipeline">build and contribute a pipeline</a> to 🧨 Diffusers. We can’t wait and see what you’ll come up with!</li> <li>Explore <a href="../api/pipelines/overview">existing pipelines</a> in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/using-diffusers/write_own_pipeline.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_1p97lbw = {
	assets: "/docs/diffusers/pr_10083/en",
	base: "/docs/diffusers/pr_10083/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/diffusers/pr_10083/en/_app/immutable/entry/start.3ed1a0f4.js"),
	import("/docs/diffusers/pr_10083/en/_app/immutable/entry/app.f0e18a17.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 255],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 64.1 kB
Xet hash:: 1a0bba79f2c2f2cd265e0ea73cf93259ece9f6593d246f5ff7f52dcbd5b8ecee

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.