Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Unit 3: Stable Diffusion&quot;,&quot;local&quot;:&quot;unit-3-stable-diffusion&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Start this Unit 🚀&quot;,&quot;local&quot;:&quot;start-this-unit-&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Introduction&quot;,&quot;local&quot;:&quot;introduction&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Latent Diffusion&quot;,&quot;local&quot;:&quot;latent-diffusion&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Text Conditioning&quot;,&quot;local&quot;:&quot;text-conditioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Classifier-free Guidance&quot;,&quot;local&quot;:&quot;classifier-free-guidance&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Other Types of Conditioning: Super-Resolution, Inpainting and Depth-to-Image&quot;,&quot;local&quot;:&quot;other-types-of-conditioning-super-resolution-inpainting-and-depth-to-image&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Fine-Tuning with DreamBooth&quot;,&quot;local&quot;:&quot;fine-tuning-with-dreambooth&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Hands-On Notebook&quot;,&quot;local&quot;:&quot;hands-on-notebook&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Project Time&quot;,&quot;local&quot;:&quot;project-time&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Some Additional Resources&quot;,&quot;local&quot;:&quot;some-additional-resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |
| <link href="/docs/diffusion-course/pr_113/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/entry/start.d783b3e7.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/scheduler.47c1f99a.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/singletons.8d8f1267.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/paths.82d718a6.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/entry/app.21133b1e.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/preload-helper.d7d11f96.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/index.bcb71b6c.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/nodes/0.c4a51760.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/nodes/12.96ffd150.js"> | |
| <link rel="modulepreload" href="/docs/diffusion-course/pr_113/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.2aa9fd83.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Unit 3: Stable Diffusion","local":"unit-3-stable-diffusion","sections":[{"title":"Start this Unit 🚀","local":"start-this-unit-","sections":[],"depth":2},{"title":"Introduction","local":"introduction","sections":[],"depth":2},{"title":"Latent Diffusion","local":"latent-diffusion","sections":[],"depth":2},{"title":"Text Conditioning","local":"text-conditioning","sections":[],"depth":2},{"title":"Classifier-free Guidance","local":"classifier-free-guidance","sections":[],"depth":2},{"title":"Other Types of Conditioning: Super-Resolution, Inpainting and Depth-to-Image","local":"other-types-of-conditioning-super-resolution-inpainting-and-depth-to-image","sections":[],"depth":2},{"title":"Fine-Tuning with DreamBooth","local":"fine-tuning-with-dreambooth","sections":[],"depth":2},{"title":"Hands-On Notebook","local":"hands-on-notebook","sections":[],"depth":2},{"title":"Project Time","local":"project-time","sections":[],"depth":2},{"title":"Some Additional Resources","local":"some-additional-resources","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg 
class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="unit-3-stable-diffusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#unit-3-stable-diffusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Unit 3: Stable Diffusion</span></h1> <p data-svelte-h="svelte-1udphu6">Welcome to Unit 3 of the Hugging Face Diffusion Models Course! In this unit you will meet a powerful diffusion model called Stable Diffusion (SD) and explore what it can do.</p> <h2 class="relative group"><a id="start-this-unit-" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#start-this-unit-"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Start this Unit 🚀</span></h2> <p data-svelte-h="svelte-1hcxnu9">Here are the steps for this unit:</p> <ul data-svelte-h="svelte-1tybmmi"><li>Make sure you’ve <a href="https://huggingface.us17.list-manage.com/subscribe?u=7f57e683fa28b51bfc493d048&id=ef963b4162" rel="nofollow">signed up for this course</a> so that you can be notified when new material is released.</li> <li>Read through the material below for an overview of the key ideas of this 
unit.</li> <li>Check out the <a href="#hands-on-notebook"><em><strong>Stable Diffusion Introduction</strong></em> notebook</a> to see SD applied in practice to some common use-cases.</li> <li>Use the <em><strong>Dreambooth</strong></em> notebook in the <a href="https://github.com/huggingface/diffusion-models-class/tree/main/hackathon" rel="nofollow"><strong>hackathon</strong> folder</a> to fine-tune your own custom Stable Diffusion model and share it with the community for a chance to win some prizes and swag.</li> <li>(Optional) Check out the <a href="https://www.youtube.com/watch?app=desktop&v=0_BBRNYInx8" rel="nofollow"><em><strong>Stable Diffusion Deep Dive video</strong></em></a> and the accompanying <a href="https://github.com/fastai/diffusion-nbs/blob/master/Stable%20Diffusion%20Deep%20Dive.ipynb" rel="nofollow"><em><strong>notebook</strong></em></a> for a deeper exploration of the different components and how they can be adapted for different effects. This material was created for the new FastAI course, <a href="https://www.fast.ai/posts/part2-2022.html" rel="nofollow">‘Stable Diffusion from the Foundations’</a> - the whole course has been released, making this a great supplement to this class for anyone curious about building these kinds of models completely from scratch.</li></ul> <p data-svelte-h="svelte-vvpwne">📢 Don’t forget to join the <a href="https://huggingface.co/join/discord" rel="nofollow">Discord</a>, where you can discuss the material and share what you’ve made in the <code>#diffusion-models-class</code> channel.</p> <h2 class="relative group"><a id="introduction" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#introduction"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Introduction</span></h2> <p data-svelte-h="svelte-jk38te"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/diffusion-course/sd_demo_images.jpg" alt="SD example images"><br> <em>Example images generated using Stable Diffusion</em></p> <p data-svelte-h="svelte-11b8aqw">Stable Diffusion is a powerful text-conditioned latent diffusion model. Don’t worry, we’ll explain those words shortly! Its ability to create amazing images from text descriptions has made it an internet sensation. 
In this unit, we’re going to explore how SD works and see what other tricks it can do.</p> <h2 class="relative group"><a id="latent-diffusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#latent-diffusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Latent Diffusion</span></h2> <p data-svelte-h="svelte-1gv7ui7">As image size grows, so does the computational power required to work with those images. This is especially pronounced in an operation called self-attention, where the amount of operations grows quadratically with the number of inputs. A 128px square image has 4x as many pixels as a 64px square image, and so requires 16x (i.e. 4<sup>2</sup>) the memory and compute in a self-attention layer. 
This is a problem for anyone who’d like to generate high-resolution images!</p> <p data-svelte-h="svelte-e5xd70"><img src="https://github.com/CompVis/latent-diffusion/raw/main/assets/modelfigure.png" alt="latent diffusion diagram"><br> <em>Diagram from the <a href="http://arxiv.org/abs/2112.10752" rel="nofollow">Latent Diffusion paper</a></em></p> <p data-svelte-h="svelte-127lhzy">Latent diffusion helps to mitigate this issue by using a separate model called a Variational Auto-Encoder (VAE) to <strong>compress</strong> images to a smaller spatial dimension. The rationale behind this is that images tend to contain a large amount of redundant information - given enough training data, a VAE can hopefully learn to produce a much smaller representation of an input image and then reconstruct the image based on this small <strong>latent</strong> representation with a high degree of fidelity. The VAE used in SD takes in 3-channel images and produces a 4-channel latent representation with a reduction factor of 8 for each spatial dimension. That is, a 512px square input image will be compressed down to a 4x64x64 latent.</p> <p data-svelte-h="svelte-1iq7lmz">By applying the diffusion process on these <strong>latent representations</strong> rather than on full-resolution images, we can get many of the benefits that would come from using smaller images (lower memory usage, fewer layers needed in the UNet, faster generation times…) and still decode the result back to a high-resolution image once we’re ready to view the final result. 
This innovation dramatically lowers the cost to train and run these models.</p> <h2 class="relative group"><a id="text-conditioning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-conditioning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text Conditioning</span></h2> <p data-svelte-h="svelte-voph2n">In Unit 2 we showed how feeding additional information to the UNet allows us to have some additional control over the types of images generated. We call this conditioning. Given a noisy version of an image, the model is tasked with predicting the denoised version <strong>based on additional clues</strong> such as a class label or, in the case of Stable Diffusion, a text description of the image. 
At inference time, we can feed in the description of an image we’d like to see and some pure noise as a starting point, and the model does its best to ‘denoise’ the random input into something that matches the caption.</p> <p data-svelte-h="svelte-zjxw0s"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/diffusion-course/text_encoder_noborder.png" alt="text encoder diagram"><br> <em>Diagram showing the text encoding process which transforms the input prompt into a set of text embeddings (the encoder_hidden_states) which can then be fed in as conditioning to the UNet.</em></p> <p data-svelte-h="svelte-11nk91f">For this to work, we need to create a numeric representation of the text that captures relevant information about what it describes. To do this, SD leverages a pre-trained transformer model based on something called CLIP. CLIP’s text encoder was designed to process image captions into a form that could be used to compare images and text, so it is well suited to the task of creating useful representations from image descriptions. An input prompt is first tokenized (based on a large vocabulary where each word or sub-word is assigned a specific token) and then fed through the CLIP text encoder, producing a 768-dimensional (in the case of SD 1.X) or 1024-dimensional (SD 2.X) vector for each token. To keep things consistent, prompts are always padded/truncated to be 77 tokens long, and so the final representation which we use as conditioning is a tensor of shape 77x768 (SD 1.X) or 77x1024 (SD 2.X) per prompt.</p> <p data-svelte-h="svelte-1k7i7zn"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/diffusion-course/sd_unet_color.png" alt="conditioning diagram"></p> <p data-svelte-h="svelte-1vm4kwn">OK, so how do we actually feed this conditioning information into the UNet for it to use as it makes predictions? The answer is something called cross-attention. 
Scattered throughout the UNet are cross-attention layers. Each spatial location in the UNet can ‘attend’ to different tokens in the text conditioning, bringing in relevant information from the prompt. The diagram above shows how this text conditioning (as well as timestep-based conditioning) is fed in at different points. As you can see, at every level the UNet has ample opportunity to make use of this conditioning!</p> <h2 class="relative group"><a id="classifier-free-guidance" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#classifier-free-guidance"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Classifier-free Guidance</span></h2> <p data-svelte-h="svelte-yi3njo">It turns out that even with all of the effort put into making the text conditioning as useful as possible, the model still tends to default to relying mostly on the noisy input image rather than the prompt when making its predictions. In a way, this makes sense - many captions are only loosely related to their associated images and so the model learns not to rely too heavily on the descriptions! 
However, this is undesirable when it comes time to generate new images - if the model doesn’t follow the prompt then we may get images out that don’t relate to our description at all.</p> <p data-svelte-h="svelte-pcp29z"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/diffusion-course/cfg_example_0_1_2_10.jpeg" alt="CFG scale demo grid"><br> <em>Images generated from the prompt “An oil painting of a collie in a top hat” with CFG scale 0, 1, 2 and 10 (left to right)</em></p> <p data-svelte-h="svelte-179e1g2">To fix this, we use a trick called Classifier-Free Guidance (CFG). During training, text conditioning is sometimes kept blank, forcing the model to learn to denoise images with no text information whatsoever (unconditional generation). Then at inference time, we make two separate predictions: one with the text prompt as conditioning and one without. We can then use the difference between these two predictions to create a final combined prediction that pushes <strong>even further</strong> in the direction indicated by the text-conditioned prediction according to some scaling factor (the guidance scale), hopefully resulting in an image that better matches the prompt. 
The image above shows the outputs for a prompt at different guidance scales - as you can see, higher values result in images that better match the description.</p> <h2 class="relative group"><a id="other-types-of-conditioning-super-resolution-inpainting-and-depth-to-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#other-types-of-conditioning-super-resolution-inpainting-and-depth-to-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Other Types of Conditioning: Super-Resolution, Inpainting and Depth-to-Image</span></h2> <p data-svelte-h="svelte-hl77ya">It is possible to create versions of Stable Diffusion that take in additional kinds of conditioning. 
For example, the <a href="https://huggingface.co/stabilityai/stable-diffusion-2-depth" rel="nofollow">Depth-to-Image model</a> has extra input channels that take in depth information about the image being denoised, and at inference time we can feed in the depth map of a target image (estimated using a separate model) to hopefully generate an image with a similar overall structure.</p> <p data-svelte-h="svelte-1k5dbh4"><img src="https://huggingface.co/stabilityai/stable-diffusion-2-depth/resolve/main/depth2image.png" alt="depth to image example"><br> <em>Depth-conditioned SD is able to generate different images with the same overall structure (example from StabilityAI)</em></p> <p data-svelte-h="svelte-frpfa7">In a similar manner, we can feed in a low-resolution image as the conditioning and have the model generate the high-resolution version (<a href="https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler" rel="nofollow">as used by the Stable Diffusion Upscaler</a>). Finally, we can feed in a mask showing a region of the image to be re-generated as part of the ‘in-painting’ task, where the non-mask regions need to stay intact while new content is generated for the masked area.</p> <h2 class="relative group"><a id="fine-tuning-with-dreambooth" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fine-tuning-with-dreambooth"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fine-Tuning with DreamBooth</span></h2> <p data-svelte-h="svelte-1hmf65s"><img src="https://dreambooth.github.io/DreamBooth_files/teaser_static.jpg" alt="dreambooth diagram"> <em>Image from the <a href="https://dreambooth.github.io/" rel="nofollow">dreambooth project page</a> based on the Imagen model</em></p> <p data-svelte-h="svelte-x4alwp">DreamBooth is a technique for fine-tuning a text-to-image model to ‘teach’ it a new concept, such as a specific object or style. The technique was originally developed for Google’s Imagen model but was quickly adapted to <a href="https://huggingface.co/docs/diffusers/training/dreambooth" rel="nofollow">work for stable diffusion</a>. Results can be extremely impressive (if you’ve seen anyone with an AI profile picture on social media recently the odds are high it came from a dreambooth-based service) but the technique is also sensitive to the settings used, so check out our notebook and <a href="https://huggingface.co/blog/dreambooth" rel="nofollow">this great investigation into the different training parameters</a> for some tips on getting it working as well as possible.</p> <h2 class="relative group"><a id="hands-on-notebook" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#hands-on-notebook"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 
0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Hands-On Notebook</span></h2> <table data-svelte-h="svelte-1717hno"><thead><tr><th align="left">Chapter</th> <th align="left">Colab</th> <th align="left">Kaggle</th> <th align="left">Gradient</th> <th align="left">Studio Lab</th></tr></thead> <tbody><tr><td align="left">Stable Diffusion Introduction</td> <td align="left"><a href="https://colab.research.google.com/github/huggingface/diffusion-models-class/blob/main/unit3/01_stable_diffusion_introduction.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a></td> <td align="left"><a href="https://kaggle.com/kernels/welcome?src=https://github.com/huggingface/diffusion-models-class/blob/main/unit3/01_stable_diffusion_introduction.ipynb" rel="nofollow"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Kaggle"></a></td> <td align="left"><a href="https://console.paperspace.com/github/huggingface/diffusion-models-class/blob/main/unit3/01_stable_diffusion_introduction.ipynb" rel="nofollow"><img src="https://assets.paperspace.io/img/gradient-badge.svg" alt="Gradient"></a></td> <td align="left"><a href="https://studiolab.sagemaker.aws/import/github/huggingface/diffusion-models-class/blob/main/unit3/01_stable_diffusion_introduction.ipynb" rel="nofollow"><img src="https://studiolab.sagemaker.aws/studiolab.svg" alt="Open In SageMaker Studio Lab"></a></td></tr> <tr><td align="left">DreamBooth Hackathon Notebook</td> <td align="left"><a 
href="https://colab.research.google.com/github/huggingface/diffusion-models-class/blob/main/hackathon/dreambooth.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a></td> <td align="left"><a href="https://kaggle.com/kernels/welcome?src=https://github.com/huggingface/diffusion-models-class/blob/main/hackathon/dreambooth.ipynb" rel="nofollow"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Kaggle"></a></td> <td align="left"><a href="https://console.paperspace.com/github/huggingface/diffusion-models-class/blob/main/hackathon/dreambooth.ipynb" rel="nofollow"><img src="https://assets.paperspace.io/img/gradient-badge.svg" alt="Gradient"></a></td> <td align="left"><a href="https://studiolab.sagemaker.aws/import/github/huggingface/diffusion-models-class/blob/main/hackathon/dreambooth.ipynb" rel="nofollow"><img src="https://studiolab.sagemaker.aws/studiolab.svg" alt="Open In SageMaker Studio Lab"></a></td></tr> <tr><td align="left">Stable Diffusion Deep Dive</td> <td align="left"><a href="https://colab.research.google.com/github/fastai/diffusion-nbs/blob/master/Stable%20Diffusion%20Deep%20Dive.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a></td> <td align="left"><a href="https://kaggle.com/kernels/welcome?src=https://github.com/fastai/diffusion-nbs/blob/master/Stable%20Diffusion%20Deep%20Dive.ipynb" rel="nofollow"><img src="https://kaggle.com/static/images/open-in-kaggle.svg" alt="Kaggle"></a></td> <td align="left"><a href="https://console.paperspace.com/github/fastai/diffusion-nbs/blob/master/Stable%20Diffusion%20Deep%20Dive.ipynb" rel="nofollow"><img src="https://assets.paperspace.io/img/gradient-badge.svg" alt="Gradient"></a></td> <td align="left"><a href="https://studiolab.sagemaker.aws/import/github/fastai/diffusion-nbs/blob/master/Stable%20Diffusion%20Deep%20Dive.ipynb" rel="nofollow"><img 
src="https://studiolab.sagemaker.aws/studiolab.svg" alt="Open In SageMaker Studio Lab"></a></td></tr></tbody></table> <p data-svelte-h="svelte-1bflbgv">At this point, you know enough to get started with the accompanying notebooks! Open them in your platform of choice using the links above. DreamBooth requires quite a lot of compute power, so if you’re using Kaggle or Google Colab make sure you set the runtime type to ‘GPU’ for the best results.</p> <p data-svelte-h="svelte-1l2vkm2">The ‘Stable Diffusion Introduction’ notebook is a short introduction to stable diffusion with the 🤗 Diffusers library, stepping through some basic usage examples using pipelines to generate and modify images.</p> <p data-svelte-h="svelte-8sil09">In the DreamBooth Hackathon Notebook (in the <a href="https://github.com/huggingface/diffusion-models-class/tree/main/hackathon" rel="nofollow">hackathon folder</a>) we show how you can fine-tune SD on your own images to create a custom version of the model covering a new style or concept.</p> <p data-svelte-h="svelte-137bm2">Finally, the ‘Stable Diffusion Deep Dive’ notebook and video break down every step in a typical generation pipeline, suggesting some novel ways to modify each stage for additional creative control.</p> <h2 class="relative group"><a id="project-time" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#project-time"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 
0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Project Time</span></h2> <p data-svelte-h="svelte-196oghr">Follow the instructions in the <strong>DreamBooth</strong> notebook to train your own model for one of the specified categories. Make sure you include the example outputs in your submission so that we can choose the best models in each category! See the <a href="https://github.com/huggingface/diffusion-models-class/tree/main/hackathon" rel="nofollow">hackathon info</a> for details on prizes, GPU credits and more.</p> <h2 class="relative group"><a id="some-additional-resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#some-additional-resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Some Additional Resources</span></h2> <ul data-svelte-h="svelte-177kal2"><li><p><a href="http://arxiv.org/abs/2112.10752" rel="nofollow">High-Resolution Image Synthesis with Latent Diffusion 
Models</a> - The paper that introduced the approach behind Stable Diffusion.</p></li> <li><p><a href="https://openai.com/blog/clip/" rel="nofollow">CLIP</a> - CLIP learns to connect text with images and the CLIP text encoder is used to transform a text prompt into the rich numerical representation used by SD. See also, <a href="https://wandb.ai/johnowhitaker/openclip-benchmarking/reports/Exploring-OpenCLIP--VmlldzoyOTIzNzIz" rel="nofollow">this article on OpenCLIP</a> for some background on recent open-source CLIP variants (one of which is used for SD version 2).</p></li> <li><p><a href="https://arxiv.org/abs/2112.10741" rel="nofollow">GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models</a> an early paper demonstrating text conditioning and CFG.</p></li></ul> <p data-svelte-h="svelte-1wvf2ag">Found more great resources? Let us know and we’ll add them to this list.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusion-models-class/blob/main/unit3/README.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
<script>
	// SvelteKit client bootstrap (auto-generated by the site build; do not edit by hand).
	{
		// Global config read by the SvelteKit runtime: asset/base URL prefixes for this
		// versioned docs deployment (pr_113, English) and an empty public-env map.
		__sveltekit_x2iol1 = {
			assets: "/docs/diffusion-course/pr_113/en",
			base: "/docs/diffusion-course/pr_113/en",
			env: {}
		};
		// Hydration target: the element wrapping this inline script.
		const element = document.currentScript.parentElement;
		// Serialized `load` results for the matched route nodes (none for this static page).
		const data = [null,null];
		// Fetch the runtime entry ("start") and the app manifest in parallel, then hydrate.
		Promise.all([
			import("/docs/diffusion-course/pr_113/en/_app/immutable/entry/start.d783b3e7.js"),
			import("/docs/diffusion-course/pr_113/en/_app/immutable/entry/app.21133b1e.js")
		]).then(([kit, app]) => {
			kit.start(app, element, {
				// NOTE(review): node_ids index into the app's route-node manifest —
				// presumably the root layout and this page; confirm against app.*.js.
				node_ids: [0, 12],
				data,
				form: null,
				error: null
			});
		});
	}
</script>
Xet Storage Details
- Size:
- 38.2 kB
- Xet hash:
- 0cbeb48c2fd4253f196c6dc60469f2381433f1ce9ad61c6d6c12e1bb09a393c6
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.