Buckets:

download
raw
49.7 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Video generation&quot;,&quot;local&quot;:&quot;video-generation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Configure model parameters&quot;,&quot;local&quot;:&quot;configure-model-parameters&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Number of frames&quot;,&quot;local&quot;:&quot;number-of-frames&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Guidance scale&quot;,&quot;local&quot;:&quot;guidance-scale&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Negative prompt&quot;,&quot;local&quot;:&quot;negative-prompt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Model-specific parameters&quot;,&quot;local&quot;:&quot;model-specific-parameters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Control video generation&quot;,&quot;local&quot;:&quot;control-video-generation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Text2Video-Zero&quot;,&quot;local&quot;:&quot;text2video-zero&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Optimize&quot;,&quot;local&quot;:&quot;optimize&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quantization&quot;,&quot;local&quot;:&quot;quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/diffusers/pr_11234/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/entry/start.5a617280.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/scheduler.8c3d61f6.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/singletons.0c831ec7.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/index.0997d446.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/paths.c606bb8d.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/entry/app.1a1de8b6.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/index.da70eac4.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/nodes/0.8544c871.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/nodes/292.2f9a07b1.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/Tip.1d9b8c37.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/CodeBlock.a9c4becf.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/index.5d4ab994.js">
<link rel="modulepreload" href="/docs/diffusers/pr_11234/en/_app/immutable/chunks/HfOption.6ab18950.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Video generation&quot;,&quot;local&quot;:&quot;video-generation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Configure model parameters&quot;,&quot;local&quot;:&quot;configure-model-parameters&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Number of frames&quot;,&quot;local&quot;:&quot;number-of-frames&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Guidance scale&quot;,&quot;local&quot;:&quot;guidance-scale&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Negative prompt&quot;,&quot;local&quot;:&quot;negative-prompt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Model-specific parameters&quot;,&quot;local&quot;:&quot;model-specific-parameters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Control video generation&quot;,&quot;local&quot;:&quot;control-video-generation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Text2Video-Zero&quot;,&quot;local&quot;:&quot;text2video-zero&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Optimize&quot;,&quot;local&quot;:&quot;optimize&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quantization&quot;,&quot;local&quot;:&quot;quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="video-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#video-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Video generation</span></h1> <p data-svelte-h="svelte-152ax7i">Video generation models include a temporal dimension to bring images, or frames, together to create a video. These models are trained on large-scale datasets of high-quality text-video pairs to learn how to combine the modalities to ensure the generated video is coherent and realistic.</p> <p data-svelte-h="svelte-omkqh8"><a href="https://huggingface.co/models?other=video-generation" rel="nofollow">Explore</a> some of the more popular open-source video generation models available from Diffusers below.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">CogVideoX </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">HunyuanVideo </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">LTX-Video </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Mochi-1 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">StableVideoDiffusion </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">AnimateDiff </div></div> <div class="language-select"><p data-svelte-h="svelte-1c0gfm"><a href="https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce" rel="nofollow">CogVideoX</a> uses a 3D causal Variational Autoencoder (VAE) to compress videos along the spatial and temporal dimensions, and it includes a stack of expert transformer blocks with a 3D full attention mechanism to better capture visual, semantic, and motion information in the data.</p> <p data-svelte-h="svelte-1nrq5rm">The CogVideoX family also includes models capable of generating videos from images and videos in addition to text. The image-to-video models are indicated by <strong>I2V</strong> in the checkpoint name, and they should be used with the <a href="/docs/diffusers/pr_11234/en/api/pipelines/cogvideox#diffusers.CogVideoXImageToVideoPipeline">CogVideoXImageToVideoPipeline</a>. The regular checkpoints support video-to-video through the <a href="/docs/diffusers/pr_11234/en/api/pipelines/cogvideox#diffusers.CogVideoXVideoToVideoPipeline">CogVideoXVideoToVideoPipeline</a>.</p> <p data-svelte-h="svelte-1799g9w">The example below demonstrates how to generate a video from an image and text prompt with <a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V" rel="nofollow">THUDM/CogVideoX-5b-I2V</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> CogVideoXImageToVideoPipeline
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video, load_image
prompt = <span class="hljs-string">&quot;A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion.&quot;</span>
image = load_image(image=<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png&quot;</span>)
pipe = CogVideoXImageToVideoPipeline.from_pretrained(
<span class="hljs-string">&quot;THUDM/CogVideoX-5b-I2V&quot;</span>,
torch_dtype=torch.bfloat16
)
<span class="hljs-comment"># reduce memory requirements </span>
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()
video = pipe(
prompt=prompt,
image=image,
num_videos_per_prompt=<span class="hljs-number">1</span>,
num_inference_steps=<span class="hljs-number">50</span>,
num_frames=<span class="hljs-number">49</span>,
guidance_scale=<span class="hljs-number">6</span>,
generator=torch.Generator(device=<span class="hljs-string">&quot;cuda&quot;</span>).manual_seed(<span class="hljs-number">42</span>),
).frames[<span class="hljs-number">0</span>]
export_to_video(video, <span class="hljs-string">&quot;output.mp4&quot;</span>, fps=<span class="hljs-number">8</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1u9aiw3"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_outrocket.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption></div></div> </div> <h2 class="relative group"><a id="configure-model-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configure-model-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configure model parameters</span></h2> <p data-svelte-h="svelte-1bg8xrr">There are a few important parameters you can configure in the pipeline that’ll affect the video generation process and quality. Let’s take a closer look at what these parameters do and how changing them affects the output.</p> <h3 class="relative group"><a id="number-of-frames" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#number-of-frames"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Number of frames</span></h3> <p data-svelte-h="svelte-1he7va5">The <code>num_frames</code> parameter determines how many video frames are generated per second. A frame is an image that is played in a sequence of other frames to create motion or a video. This affects video length because the pipeline generates a certain number of frames per second (check a pipeline’s API reference for the default value). To increase the video duration, you’ll need to increase the <code>num_frames</code> parameter.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableVideoDiffusionPipeline
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, export_to_video
pipeline = StableVideoDiffusionPipeline.from_pretrained(
<span class="hljs-string">&quot;stabilityai/stable-video-diffusion-img2vid&quot;</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">&quot;fp16&quot;</span>
)
pipeline.enable_model_cpu_offload()
image = load_image(<span class="hljs-string">&quot;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png&quot;</span>)
image = image.resize((<span class="hljs-number">1024</span>, <span class="hljs-number">576</span>))
generator = torch.manual_seed(<span class="hljs-number">42</span>)
frames = pipeline(image, decode_chunk_size=<span class="hljs-number">8</span>, generator=generator, num_frames=<span class="hljs-number">25</span>).frames[<span class="hljs-number">0</span>]
export_to_video(frames, <span class="hljs-string">&quot;generated.mp4&quot;</span>, fps=<span class="hljs-number">7</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1hb15b9"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_14.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=14</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_25.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=25</figcaption></div></div> <h3 class="relative group"><a id="guidance-scale" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#guidance-scale"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Guidance scale</span></h3> <p data-svelte-h="svelte-10ah3pm">The <code>guidance_scale</code> parameter controls how closely aligned the generated video and text prompt or initial image is. A higher <code>guidance_scale</code> value means your generated video is more aligned with the text prompt or initial image, while a lower <code>guidance_scale</code> value means your generated video is less aligned which could give the model more “creativity” to interpret the conditioning input.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1i6okx6">SVD uses the <code>min_guidance_scale</code> and <code>max_guidance_scale</code> parameters for applying guidance to the first and last frames respectively.</p></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> I2VGenXLPipeline
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif, load_image
pipeline = I2VGenXLPipeline.from_pretrained(<span class="hljs-string">&quot;ali-vilab/i2vgen-xl&quot;</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">&quot;fp16&quot;</span>)
pipeline.enable_model_cpu_offload()
image_url = <span class="hljs-string">&quot;https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png&quot;</span>
image = load_image(image_url).convert(<span class="hljs-string">&quot;RGB&quot;</span>)
prompt = <span class="hljs-string">&quot;Papers were floating in the air on a table in the library&quot;</span>
negative_prompt = <span class="hljs-string">&quot;Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms&quot;</span>
generator = torch.manual_seed(<span class="hljs-number">0</span>)
frames = pipeline(
prompt=prompt,
image=image,
num_inference_steps=<span class="hljs-number">50</span>,
negative_prompt=negative_prompt,
guidance_scale=<span class="hljs-number">1.0</span>,
generator=generator
).frames[<span class="hljs-number">0</span>]
export_to_gif(frames, <span class="hljs-string">&quot;i2v.gif&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1491ymz"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=9.0</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guidance_scale_1.0.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=1.0</figcaption></div></div> <h3 class="relative group"><a id="negative-prompt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#negative-prompt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Negative prompt</span></h3> <p data-svelte-h="svelte-xdqe9k">A negative prompt deters the model from generating things you don’t want it to. This parameter is commonly used to improve overall generation quality by removing poor or bad features such as “low resolution” or “bad details”.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AnimateDiffPipeline, DDIMScheduler, MotionAdapter
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif
adapter = MotionAdapter.from_pretrained(<span class="hljs-string">&quot;guoyww/animatediff-motion-adapter-v1-5-2&quot;</span>, torch_dtype=torch.float16)
pipeline = AnimateDiffPipeline.from_pretrained(<span class="hljs-string">&quot;emilianJR/epiCRealism&quot;</span>, motion_adapter=adapter, torch_dtype=torch.float16)
scheduler = DDIMScheduler.from_pretrained(
<span class="hljs-string">&quot;emilianJR/epiCRealism&quot;</span>,
subfolder=<span class="hljs-string">&quot;scheduler&quot;</span>,
clip_sample=<span class="hljs-literal">False</span>,
timestep_spacing=<span class="hljs-string">&quot;linspace&quot;</span>,
beta_schedule=<span class="hljs-string">&quot;linear&quot;</span>,
steps_offset=<span class="hljs-number">1</span>,
)
pipeline.scheduler = scheduler
pipeline.enable_vae_slicing()
pipeline.enable_model_cpu_offload()
output = pipeline(
prompt=<span class="hljs-string">&quot;360 camera shot of a sushi roll in a restaurant&quot;</span>,
negative_prompt=<span class="hljs-string">&quot;Distorted, discontinuous, ugly, blurry, low resolution, motionless, static&quot;</span>,
num_frames=<span class="hljs-number">16</span>,
guidance_scale=<span class="hljs-number">7.5</span>,
num_inference_steps=<span class="hljs-number">50</span>,
generator=torch.Generator(<span class="hljs-string">&quot;cpu&quot;</span>).manual_seed(<span class="hljs-number">0</span>),
)
frames = output.frames[<span class="hljs-number">0</span>]
export_to_gif(frames, <span class="hljs-string">&quot;animation.gif&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-139sag9"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_no_neg.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">no negative prompt</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_neg.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt applied</figcaption></div></div> <h3 class="relative group"><a id="model-specific-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#model-specific-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Model-specific parameters</span></h3> <p data-svelte-h="svelte-1d2wgj6">There are some pipeline parameters that are unique to each model such as adjusting the motion in a video or adding noise to the initial image.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Stable Video Diffusion </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Text2Video-Zero </div></div> <div class="language-select"><p data-svelte-h="svelte-17vrdx1">Stable Video Diffusion provides additional micro-conditioning for the frame rate with the <code>fps</code> parameter and for motion with the <code>motion_bucket_id</code> parameter. Together, these parameters allow for adjusting the amount of motion in the generated video.</p> <p data-svelte-h="svelte-1rvhfxn">There is also a <code>noise_aug_strength</code> parameter that increases the amount of noise added to the initial image. Varying this parameter affects how similar the generated video and initial image are. A higher <code>noise_aug_strength</code> also increases the amount of motion. To learn more, read the <a href="../using-diffusers/svd#micro-conditioning">Micro-conditioning</a> guide.</p> </div> <h2 class="relative group"><a id="control-video-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#control-video-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Control video generation</span></h2> <p data-svelte-h="svelte-1k7xqn2">Video generation can be controlled similar to how text-to-image, image-to-image, and inpainting can be controlled with a <a href="/docs/diffusers/pr_11234/en/api/models/controlnet#diffusers.ControlNetModel">ControlNetModel</a>. The only difference is you need to use the <a href="/docs/diffusers/pr_11234/en/api/attnprocessor#diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor">CrossFrameAttnProcessor</a> so each frame attends to the first frame.</p> <h3 class="relative group"><a id="text2video-zero" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text2video-zero"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text2Video-Zero</span></h3> <p data-svelte-h="svelte-w25jvx">Text2Video-Zero video generation can be conditioned on pose and edge images for even greater control over a subject’s motion in the generated video or to preserve the identity of a subject/object in the video. You can also use Text2Video-Zero with <a href="../api/pipelines/pix2pix">InstructPix2Pix</a> for editing videos with text.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">pose control </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">edge control </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">InstructPix2Pix </div></div> <div class="language-select"><p data-svelte-h="svelte-iz4nwy">Start by downloading a video and extracting the pose images from it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> hf_hub_download
<span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
<span class="hljs-keyword">import</span> imageio
filename = <span class="hljs-string">&quot;__assets__/poses_skeleton_gifs/dance1_corr.mp4&quot;</span>
repo_id = <span class="hljs-string">&quot;PAIR/Text2Video-Zero&quot;</span>
video_path = hf_hub_download(repo_type=<span class="hljs-string">&quot;space&quot;</span>, repo_id=repo_id, filename=filename)
reader = imageio.get_reader(video_path, <span class="hljs-string">&quot;ffmpeg&quot;</span>)
frame_count = <span class="hljs-number">8</span>
pose_images = [Image.fromarray(reader.get_data(i)) <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(frame_count)]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ceib0j">Load a <a href="/docs/diffusers/pr_11234/en/api/models/controlnet#diffusers.ControlNetModel">ControlNetModel</a> for pose estimation and a checkpoint into the <a href="/docs/diffusers/pr_11234/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline">StableDiffusionControlNetPipeline</a>. Then you’ll use the <a href="/docs/diffusers/pr_11234/en/api/attnprocessor#diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor">CrossFrameAttnProcessor</a> for the UNet and ControlNet.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetPipeline, ControlNetModel
<span class="hljs-keyword">from</span> diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero <span class="hljs-keyword">import</span> CrossFrameAttnProcessor
model_id = <span class="hljs-string">&quot;stable-diffusion-v1-5/stable-diffusion-v1-5&quot;</span>
controlnet = ControlNetModel.from_pretrained(<span class="hljs-string">&quot;lllyasviel/sd-controlnet-openpose&quot;</span>, torch_dtype=torch.float16)
pipeline = StableDiffusionControlNetPipeline.from_pretrained(
model_id, controlnet=controlnet, torch_dtype=torch.float16
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=<span class="hljs-number">2</span>))
pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=<span class="hljs-number">2</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-se6rgz">Fix the latents for all the frames, and then pass your prompt and extracted pose images to the model to generate a video.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->latents = torch.randn((<span class="hljs-number">1</span>, <span class="hljs-number">4</span>, <span class="hljs-number">64</span>, <span class="hljs-number">64</span>), device=<span class="hljs-string">&quot;cuda&quot;</span>, dtype=torch.float16).repeat(<span class="hljs-built_in">len</span>(pose_images), <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>)
prompt = <span class="hljs-string">&quot;Darth Vader dancing in a desert&quot;</span>
result = pipeline(prompt=[prompt] * <span class="hljs-built_in">len</span>(pose_images), image=pose_images, latents=latents).images
imageio.mimsave(<span class="hljs-string">&quot;video.mp4&quot;</span>, result, fps=<span class="hljs-number">4</span>)<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="optimize" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimize"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimize</span></h2> <p data-svelte-h="svelte-i4oodb">Video generation requires a lot of memory because you’re generating many video frames at once. You can reduce your memory requirements at the expense of some inference speed. Try:</p> <ol data-svelte-h="svelte-4xypfn"><li>offloading pipeline components that are no longer needed to the CPU</li> <li>feed-forward chunking runs the feed-forward layer in a loop instead of all at once</li> <li>break up the number of frames the VAE has to decode into chunks instead of decoding them all at once</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- pipeline.enable_model_cpu_offload()</span>
<span class="hljs-deletion">- frames = pipeline(image, decode_chunk_size=8, generator=generator).frames[0]</span>
<span class="hljs-addition">+ pipeline.enable_model_cpu_offload()</span>
<span class="hljs-addition">+ pipeline.unet.enable_forward_chunking()</span>
<span class="hljs-addition">+ frames = pipeline(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dynxrt">If memory is not an issue and you want to optimize for speed, try wrapping the UNet with <a href="../optimization/torch2.0#torchcompile"><code>torch.compile</code></a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- pipeline.enable_model_cpu_offload()</span>
<span class="hljs-addition">+ pipeline.to(&quot;cuda&quot;)</span>
<span class="hljs-addition">+ pipeline.unet = torch.compile(pipeline.unet, mode=&quot;reduce-overhead&quot;, fullgraph=True)</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="quantization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quantization</span></h2> <p data-svelte-h="svelte-1ou2pxc">Quantization helps reduce the memory requirements of very large models by storing model weights in a lower precision data type. However, quantization may have varying impact on video quality depending on the video model.</p> <p data-svelte-h="svelte-1cvnbkg">Refer to the <a href="../../quantization/overview">Quantization</a> to learn more about supported quantization backends (bitsandbytes, torchao, gguf) and selecting a quantization backend that supports your use case.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/using-diffusers/text-img2vid.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_225bzv = {
assets: "/docs/diffusers/pr_11234/en",
base: "/docs/diffusers/pr_11234/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/diffusers/pr_11234/en/_app/immutable/entry/start.5a617280.js"),
import("/docs/diffusers/pr_11234/en/_app/immutable/entry/app.1a1de8b6.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 292],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
49.7 kB
·
Xet hash:
5330f435f53b4dc9b39821cfd0aa88f832d660b7bc8c5404b7223f9e0b67d0a4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.