Buckets:
| <meta charset="utf-8"><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Quickstart&quot;,&quot;local&quot;:&quot;quickstart&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;DiffusionPipeline&quot;,&quot;local&quot;:&quot;diffusionpipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;LoRA&quot;,&quot;local&quot;:&quot;lora&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quantization&quot;,&quot;local&quot;:&quot;quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Optimizations&quot;,&quot;local&quot;:&quot;optimizations&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Memory usage&quot;,&quot;local&quot;:&quot;memory-usage&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Inference speed&quot;,&quot;local&quot;:&quot;inference-speed&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |
| <link href="/docs/diffusers/pr_12262/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/entry/start.1f2e0047.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/scheduler.8c3d61f6.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/singletons.0ac1f958.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/index.0997d446.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/paths.f1dfa57d.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/entry/app.29980aee.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/index.da70eac4.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/nodes/0.c56697ec.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/nodes/267.cb72844a.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/Tip.1d9b8c37.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/CodeBlock.a9c4becf.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/getInferenceSnippets.676f6ee5.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12262/en/_app/immutable/chunks/HfOption.6c3b4e77.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Quickstart","local":"quickstart","sections":[{"title":"DiffusionPipeline","local":"diffusionpipeline","sections":[],"depth":2},{"title":"LoRA","local":"lora","sections":[],"depth":2},{"title":"Quantization","local":"quantization","sections":[],"depth":2},{"title":"Optimizations","local":"optimizations","sections":[{"title":"Memory usage","local":"memory-usage","sections":[],"depth":3},{"title":"Inference speed","local":"inference-speed","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="quickstart" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quickstart"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quickstart</span></h1> <p data-svelte-h="svelte-56xior">Diffusers is a library for developers and researchers that provides an easy inference API for generating images, videos and audio, as well as the building blocks for implementing new workflows.</p> <p 
data-svelte-h="svelte-st2d2w">Diffusers provides many optimizations out-of-the-box that makes it possible to load and run large models on setups with limited memory or to accelerate inference.</p> <p data-svelte-h="svelte-ty6z9h">This Quickstart will give you an overview of Diffusers and get you up and generating quickly.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-83k6u8">Before you begin, make sure you have a Hugging Face <a href="https://huggingface.co/join" rel="nofollow">account</a> in order to use gated models like <a href="https://huggingface.co/black-forest-labs/FLUX.1-dev" rel="nofollow">Flux</a>.</p></div> <p data-svelte-h="svelte-1mbmwu">Follow the <a href="./installation">Installation</a> guide to install Diffusers if it’s not already installed.</p> <h2 class="relative group"><a id="diffusionpipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diffusionpipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>DiffusionPipeline</span></h2> <p data-svelte-h="svelte-1uey2wy">A diffusion model combines multiple components to generate outputs in any modality based on an input, such as a text description, image or both.</p> <p data-svelte-h="svelte-1p4ihn">For a standard text-to-image model:</p> <ol data-svelte-h="svelte-107s2nd"><li><p>A text encoder turns a prompt into embeddings that guide the denoising process. Some models have more than one text encoder.</p></li> <li><p>A scheduler contains the algorithmic specifics for gradually denoising initial random noise into clean outputs. Different schedulers affect generation speed and quality.</p></li> <li><p>A UNet or diffusion transformer (DiT) is the workhorse of a diffusion model.</p> <p>At each step, it performs the denoising predictions, such as how much noise to remove or the general direction in which to steer the noise to generate better quality outputs.</p> <p>The UNet or DiT repeats this loop for a set amount of steps to generate the final output.</p></li> <li><p>A variational autoencoder (VAE) encodes and decodes pixels to a spatially compressed latent-space. <em>Latents</em> are compressed representations of an image and are more efficient to work with. The UNet or DiT operates on latents, and the clean latents at the end are decoded back into images.</p></li></ol> <p data-svelte-h="svelte-37xznr">The <a href="/docs/diffusers/pr_12262/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a> packages all these components into a single class for inference. There are several arguments in <code>__call__()</code> you can change, such as <code>num_inference_steps</code>, that affect the diffusion process. 
Try different values and arguments to see how they change generation quality or speed.</p> <p data-svelte-h="svelte-16g978h">Load a model with <a href="/docs/diffusers/pr_12262/en/api/pipelines/overview#diffusers.DiffusionPipeline.from_pretrained">from_pretrained()</a> and describe what you’d like to generate. The example below uses the default argument values.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">text-to-image </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">text-to-video </div></div> <div class="language-select"><p data-svelte-h="svelte-1spi3an">Use <code>.images[0]</code> to access the generated image output.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline | |
| pipeline = DiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"Qwen/Qwen-Image"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| prompt = <span class="hljs-string">""" | |
| cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California | |
| highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain | |
| """</span> | |
| pipeline(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="lora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LoRA</span></h2> <p data-svelte-h="svelte-jua4gq">Adapters insert a small number of trainable parameters to the original base model. Only the inserted parameters are fine-tuned while the rest of the model weights remain frozen. This makes it fast and cheap to fine-tune a model on a new style. Among adapters, <a href="./tutorials/using_peft_for_inference">LoRA’s</a> are the most popular.</p> <p data-svelte-h="svelte-1lek080">Add a LoRA to a pipeline with the <a href="/docs/diffusers/pr_12262/en/api/loaders/lora#diffusers.loaders.QwenImageLoraLoaderMixin.load_lora_weights">load_lora_weights()</a> method. Some LoRA’s require a special word to trigger it, such as <code>Realism</code>, in the example below. 
Check a LoRA’s model card to see if it requires a trigger word.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline | |
| pipeline = DiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"Qwen/Qwen-Image"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| pipeline.load_lora_weights( | |
| <span class="hljs-string">"flymy-ai/qwen-image-realism-lora"</span>, | |
| ) | |
| prompt = <span class="hljs-string">""" | |
| super Realism cinematic film still of a cat sipping a margarita in a pool in Palm Springs in the style of umempart, California | |
| highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain | |
| """</span> | |
| pipeline(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ghzmtu">Check out the <a href="./tutorials/using_peft_for_inference">LoRA</a> docs or Adapters section to learn more.</p> <h2 class="relative group"><a id="quantization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quantization</span></h2> <p data-svelte-h="svelte-ba18t5"><a href="./quantization/overview">Quantization</a> stores data in fewer bits to reduce memory usage. It may also speed up inference because it takes less time to perform calculations with fewer bits.</p> <p data-svelte-h="svelte-hvktkn">Diffusers provides several quantization backends and picking one depends on your use case. 
For example, <a href="./quantization/bitsandbytes">bitsandbytes</a> and <a href="./quantization/torchao">torchao</a> are both simple and easy to use for inference, but torchao supports more <a href="./quantization/torchao#supported-quantization-types">quantization types</a> like fp8.</p> <p data-svelte-h="svelte-xixg6c">Configure <a href="/docs/diffusers/pr_12262/en/api/quantization#diffusers.PipelineQuantizationConfig">PipelineQuantizationConfig</a> with the backend to use, the specific arguments (refer to the <a href="./api/quantization">API</a> reference for available arguments) for that backend, and which components to quantize. The example below quantizes the model to 4-bits and only uses 14.93GB of memory.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline | |
| <span class="hljs-keyword">from</span> diffusers.quantizers <span class="hljs-keyword">import</span> PipelineQuantizationConfig | |
| quant_config = PipelineQuantizationConfig( | |
| quant_backend=<span class="hljs-string">"bitsandbytes_4bit"</span>, | |
| quant_kwargs={<span class="hljs-string">"load_in_4bit"</span>: <span class="hljs-literal">True</span>, <span class="hljs-string">"bnb_4bit_quant_type"</span>: <span class="hljs-string">"nf4"</span>, <span class="hljs-string">"bnb_4bit_compute_dtype"</span>: torch.bfloat16}, | |
| components_to_quantize=[<span class="hljs-string">"transformer"</span>, <span class="hljs-string">"text_encoder"</span>], | |
| ) | |
| pipeline = DiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"Qwen/Qwen-Image"</span>, | |
| torch_dtype=torch.bfloat16, | |
| quantization_config=quant_config, | |
| device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| prompt = <span class="hljs-string">""" | |
| cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California | |
| highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain | |
| """</span> | |
| pipeline(prompt).images[<span class="hljs-number">0</span>] | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Max memory reserved: <span class="hljs-subst">{torch.cuda.max_memory_allocated() / <span class="hljs-number">1024</span>**<span class="hljs-number">3</span>:<span class="hljs-number">.2</span>f}</span> GB"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lj8jhg">Take a look at the <a href="./quantization/overview">Quantization</a> section for more details.</p> <h2 class="relative group"><a id="optimizations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimizations</span></h2> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-159d0jw">Optimization is dependent on hardware specs such as memory. 
Use this <a href="https://huggingface.co/spaces/diffusers/optimized-diffusers-code" rel="nofollow">Space</a> to generate code examples that include all of Diffusers’ available memory and speed optimization techniques for any model you’re using.</p></div> <p data-svelte-h="svelte-17bpo15">Modern diffusion models are very large and have billions of parameters. The iterative denoising process is also computationally intensive and slow. Diffusers provides techniques for reducing memory usage and boosting inference speed. These techniques can be combined with quantization to optimize for both memory usage and inference speed.</p> <h3 class="relative group"><a id="memory-usage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-usage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory usage</span></h3> <p data-svelte-h="svelte-1bcil2">The text encoders and UNet or DiT can use up as much as ~30GB of memory, exceeding the amount available on many free-tier or consumer GPUs.</p> <p data-svelte-h="svelte-1udfe87">Offloading stores weights that aren’t currently used on the CPU and only moves them to the GPU when they’re needed. 
There are a few offloading types and the example below uses <a href="./optimization/memory#model-offloading">model offloading</a>. This moves an entire model, like a text encoder or transformer, to the CPU when it isn’t actively being used.</p> <p data-svelte-h="svelte-1l9l5zr">Call <a href="/docs/diffusers/pr_12262/en/api/pipelines/overview#diffusers.DiffusionPipeline.enable_model_cpu_offload">enable_model_cpu_offload()</a> to activate it. By combining quantization and offloading, the following example only requires ~12.54GB of memory.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline | |
| <span class="hljs-keyword">from</span> diffusers.quantizers <span class="hljs-keyword">import</span> PipelineQuantizationConfig | |
| quant_config = PipelineQuantizationConfig( | |
| quant_backend=<span class="hljs-string">"bitsandbytes_4bit"</span>, | |
| quant_kwargs={<span class="hljs-string">"load_in_4bit"</span>: <span class="hljs-literal">True</span>, <span class="hljs-string">"bnb_4bit_quant_type"</span>: <span class="hljs-string">"nf4"</span>, <span class="hljs-string">"bnb_4bit_compute_dtype"</span>: torch.bfloat16}, | |
| components_to_quantize=[<span class="hljs-string">"transformer"</span>, <span class="hljs-string">"text_encoder"</span>], | |
| ) | |
| pipeline = DiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"Qwen/Qwen-Image"</span>, | |
| torch_dtype=torch.bfloat16, | |
| quantization_config=quant_config, | |
| device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| pipeline.enable_model_cpu_offload() | |
| prompt = <span class="hljs-string">""" | |
| cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California | |
| highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain | |
| """</span> | |
| pipeline(prompt).images[<span class="hljs-number">0</span>] | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Max memory reserved: <span class="hljs-subst">{torch.cuda.max_memory_allocated() / <span class="hljs-number">1024</span>**<span class="hljs-number">3</span>:<span class="hljs-number">.2</span>f}</span> GB"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ad4p4e">Refer to the <a href="./optimization/memory">Reduce memory usage</a> docs to learn more about other memory reducing techniques.</p> <h3 class="relative group"><a id="inference-speed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference-speed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference speed</span></h3> <p data-svelte-h="svelte-1cvq90h">The denoising loop performs a lot of computations and can be slow. Methods like <a href="./optimization/fp16#torchcompile">torch.compile</a> increases inference speed by compiling the computations into an optimized kernel. 
Compilation is slow for the first generation but successive generations should be much faster.</p> <p data-svelte-h="svelte-1teo5ea">The example below uses <a href="./optimization/fp16#regional-compilation">regional compilation</a> to only compile small regions of a model. It reduces cold-start latency while also providing a runtime speed up.</p> <p data-svelte-h="svelte-1n8teh">Call <a href="/docs/diffusers/pr_12262/en/api/models/overview#diffusers.ModelMixin.compile_repeated_blocks">compile_repeated_blocks()</a> on the model to activate it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline | |
| pipeline = DiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"Qwen/Qwen-Image"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| pipeline.transformer.compile_repeated_blocks( | |
| fullgraph=<span class="hljs-literal">True</span>, | |
| ) | |
| prompt = <span class="hljs-string">""" | |
| cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California | |
| highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain | |
| """</span> | |
| pipeline(prompt).images[<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1f7t84p">Check out the <a href="./optimization/fp16">Accelerate inference</a> or <a href="./optimization/cache">Caching</a> docs for more methods that speed up inference.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/quicktour.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
// SvelteKit client-side hydration bootstrap (build-generated; do not hand-edit —
// the hashed filenames and the global name change on every rebuild).
| { | |
// Global runtime config read by the SvelteKit client: asset/base URL paths for
// this versioned docs build (pr_12262/en) and an empty injected-env map.
| __sveltekit_xpwf7g = { | |
| assets: "/docs/diffusers/pr_12262/en", | |
| base: "/docs/diffusers/pr_12262/en", | |
| env: {} | |
| }; | |
// Hydration mount target: the element wrapping this inline <script>.
| const element = document.currentScript.parentElement; | |
// Per-route-node load data; both entries are null (no server data for this page).
| const data = [null,null]; | |
// Fetch the two runtime entry modules in parallel, then start the app.
| Promise.all([ | |
| import("/docs/diffusers/pr_12262/en/_app/immutable/entry/start.1f2e0047.js"), | |
| import("/docs/diffusers/pr_12262/en/_app/immutable/entry/app.29980aee.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
// Route nodes to hydrate: layout node 0 and page node 267 — these match the
// modulepreloaded chunks nodes/0.*.js and nodes/267.*.js in the document head.
| node_ids: [0, 267], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 32.9 kB
- Xet hash:
- 1a95234c739b0380430418ffc1f63308af049a91beea7c53e80b9971ecbffef3
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.