Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Text or image-to-video","local":"text-or-image-to-video","sections":[{"title":"Popular models","local":"popular-models","sections":[{"title":"CogVideoX","local":"cogvideox","sections":[],"depth":3},{"title":"Stable Video Diffusion","local":"stable-video-diffusion","sections":[],"depth":3},{"title":"I2VGen-XL","local":"i2vgen-xl","sections":[],"depth":3},{"title":"AnimateDiff","local":"animatediff","sections":[],"depth":3},{"title":"ModelscopeT2V","local":"modelscopet2v","sections":[],"depth":3}],"depth":2},{"title":"Configure model parameters","local":"configure-model-parameters","sections":[{"title":"Number of frames","local":"number-of-frames","sections":[],"depth":3},{"title":"Guidance scale","local":"guidance-scale","sections":[],"depth":3},{"title":"Negative prompt","local":"negative-prompt","sections":[],"depth":3},{"title":"Model-specific parameters","local":"model-specific-parameters","sections":[],"depth":3}],"depth":2},{"title":"Control video generation","local":"control-video-generation","sections":[{"title":"Text2Video-Zero","local":"text2video-zero","sections":[],"depth":3}],"depth":2},{"title":"Optimize","local":"optimize","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/diffusers/pr_10312/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/entry/start.203b6290.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/scheduler.8c3d61f6.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/singletons.8b179f45.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/index.0997d446.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/paths.3fd58d56.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/entry/app.423ea23f.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/index.da70eac4.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/nodes/0.e544eae6.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/nodes/266.ce4989c5.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/Tip.1d9b8c37.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/CodeBlock.00a903b3.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/EditOnGithub.1e64e623.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/HfOption.c1483eb1.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_10312/en/_app/immutable/chunks/stores.d6eecc38.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Text or image-to-video","local":"text-or-image-to-video","sections":[{"title":"Popular models","local":"popular-models","sections":[{"title":"CogVideoX","local":"cogvideox","sections":[],"depth":3},{"title":"Stable Video Diffusion","local":"stable-video-diffusion","sections":[],"depth":3},{"title":"I2VGen-XL","local":"i2vgen-xl","sections":[],"depth":3},{"title":"AnimateDiff","local":"animatediff","sections":[],"depth":3},{"title":"ModelscopeT2V","local":"modelscopet2v","sections":[],"depth":3}],"depth":2},{"title":"Configure model parameters","local":"configure-model-parameters","sections":[{"title":"Number of frames","local":"number-of-frames","sections":[],"depth":3},{"title":"Guidance scale","local":"guidance-scale","sections":[],"depth":3},{"title":"Negative prompt","local":"negative-prompt","sections":[],"depth":3},{"title":"Model-specific parameters","local":"model-specific-parameters","sections":[],"depth":3}],"depth":2},{"title":"Control video generation","local":"control-video-generation","sections":[{"title":"Text2Video-Zero","local":"text2video-zero","sections":[],"depth":3}],"depth":2},{"title":"Optimize","local":"optimize","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="text-or-image-to-video" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-or-image-to-video"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text or image-to-video</span></h1> <p data-svelte-h="svelte-1gf6o60">Driven by the success of text-to-image diffusion models, generative video models are able to generate short clips of video from a text prompt or an initial image. These models extend a pretrained diffusion model to generate videos by adding some type of temporal and/or spatial convolution layer to the architecture. A mixed dataset of images and videos are used to train the model which learns to output a series of video frames based on the text or image conditioning.</p> <p data-svelte-h="svelte-483wvv">This guide will show you how to generate videos, how to configure video model parameters, and how to control video generation.</p> <h2 class="relative group"><a id="popular-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#popular-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Popular models</span></h2> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-af52q">Discover other cool and trending video generation models on the Hub <a href="https://huggingface.co/models?pipeline_tag=text-to-video&sort=trending" rel="nofollow">here</a>!</p></div> <p data-svelte-h="svelte-earwrk"><a href="https://huggingface.co/stabilityai/stable-video-diffusion-img2vid" rel="nofollow">Stable Video Diffusions (SVD)</a>, <a href="https://huggingface.co/ali-vilab/i2vgen-xl/" rel="nofollow">I2VGen-XL</a>, <a href="https://huggingface.co/guoyww/animatediff" rel="nofollow">AnimateDiff</a>, and <a href="https://huggingface.co/ali-vilab/text-to-video-ms-1.7b" rel="nofollow">ModelScopeT2V</a> are popular models used for video diffusion. Each model is distinct. For example, AnimateDiff inserts a motion modeling module into a frozen text-to-image model to generate personalized animated images, whereas SVD is entirely pretrained from scratch with a three-stage training process to generate short high-quality videos.</p> <p data-svelte-h="svelte-vkguzc"><a href="https://huggingface.co/collections/THUDM/cogvideo-66c08e62f1685a3ade464cce" rel="nofollow">CogVideoX</a> is another popular video generation model. The model is a multidimensional transformer that integrates text, time, and space. It employs full attention in the attention module and includes an expert block at the layer level to spatially align text and video.</p> <h3 class="relative group"><a id="cogvideox" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cogvideox"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>CogVideoX</span></h3> <p data-svelte-h="svelte-1jbs1xy"><a href="../api/pipelines/cogvideox">CogVideoX</a> uses a 3D Variational Autoencoder (VAE) to compress videos along the spatial and temporal dimensions.</p> <p data-svelte-h="svelte-9j6tzp">Begin by loading the <a href="/docs/diffusers/pr_10312/en/api/pipelines/cogvideox#diffusers.CogVideoXPipeline">CogVideoXPipeline</a> and passing an initial text or image to generate a video.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1g2bqxo">CogVideoX is available for image-to-video and text-to-video. <a href="https://huggingface.co/THUDM/CogVideoX-5b-I2V" rel="nofollow">THUDM/CogVideoX-5b-I2V</a> uses the <a href="/docs/diffusers/pr_10312/en/api/pipelines/cogvideox#diffusers.CogVideoXImageToVideoPipeline">CogVideoXImageToVideoPipeline</a> for image-to-video. <a href="https://huggingface.co/THUDM/CogVideoX-5b" rel="nofollow">THUDM/CogVideoX-5b</a> and <a href="https://huggingface.co/THUDM/CogVideoX-2b" rel="nofollow">THUDM/CogVideoX-2b</a> are available for text-to-video with the <a href="/docs/diffusers/pr_10312/en/api/pipelines/cogvideox#diffusers.CogVideoXPipeline">CogVideoXPipeline</a>.</p></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> CogVideoXImageToVideoPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video, load_image | |
| prompt = <span class="hljs-string">"A vast, shimmering ocean flows gracefully under a twilight sky, its waves undulating in a mesmerizing dance of blues and greens. The surface glints with the last rays of the setting sun, casting golden highlights that ripple across the water. Seagulls soar above, their cries blending with the gentle roar of the waves. The horizon stretches infinitely, where the ocean meets the sky in a seamless blend of hues. Close-ups reveal the intricate patterns of the waves, capturing the fluidity and dynamic beauty of the sea in motion."</span> | |
| image = load_image(image=<span class="hljs-string">"cogvideox_rocket.png"</span>) | |
| pipe = CogVideoXImageToVideoPipeline.from_pretrained( | |
| <span class="hljs-string">"THUDM/CogVideoX-5b-I2V"</span>, | |
| torch_dtype=torch.bfloat16 | |
| ) | |
| pipe.vae.enable_tiling() | |
| pipe.vae.enable_slicing() | |
| video = pipe( | |
| prompt=prompt, | |
| image=image, | |
| num_videos_per_prompt=<span class="hljs-number">1</span>, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| num_frames=<span class="hljs-number">49</span>, | |
| guidance_scale=<span class="hljs-number">6</span>, | |
| generator=torch.Generator(device=<span class="hljs-string">"cuda"</span>).manual_seed(<span class="hljs-number">42</span>), | |
| ).frames[<span class="hljs-number">0</span>] | |
| export_to_video(video, <span class="hljs-string">"output.mp4"</span>, fps=<span class="hljs-number">8</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1u9aiw3"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_rocket.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cogvideox/cogvideox_outrocket.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption></div></div> <h3 class="relative group"><a id="stable-video-diffusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#stable-video-diffusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Stable Video Diffusion</span></h3> <p data-svelte-h="svelte-1s1nsr3"><a href="../api/pipelines/svd">SVD</a> is based on the Stable Diffusion 2.1 model and it is trained on images, then low-resolution videos, and finally a smaller dataset of high-resolution videos. This model generates a short 2-4 second video from an initial image. You can learn more details about model, like micro-conditioning, in the <a href="../using-diffusers/svd">Stable Video Diffusion</a> guide.</p> <p data-svelte-h="svelte-14ev7vs">Begin by loading the <a href="/docs/diffusers/pr_10312/en/api/pipelines/stable_diffusion/svd#diffusers.StableVideoDiffusionPipeline">StableVideoDiffusionPipeline</a> and passing an initial image to generate a video from.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableVideoDiffusionPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, export_to_video | |
| pipeline = StableVideoDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stabilityai/stable-video-diffusion-img2vid-xt"</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">"fp16"</span> | |
| ) | |
| pipeline.enable_model_cpu_offload() | |
| image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"</span>) | |
| image = image.resize((<span class="hljs-number">1024</span>, <span class="hljs-number">576</span>)) | |
| generator = torch.manual_seed(<span class="hljs-number">42</span>) | |
| frames = pipeline(image, decode_chunk_size=<span class="hljs-number">8</span>, generator=generator).frames[<span class="hljs-number">0</span>] | |
| export_to_video(frames, <span class="hljs-string">"generated.mp4"</span>, fps=<span class="hljs-number">7</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1oxa21d"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/output_rocket.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption></div></div> <h3 class="relative group"><a id="i2vgen-xl" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#i2vgen-xl"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>I2VGen-XL</span></h3> <p data-svelte-h="svelte-1fectlj"><a href="../api/pipelines/i2vgenxl">I2VGen-XL</a> is a diffusion model that can generate higher resolution videos than SVD and it is also capable of accepting text prompts in addition to images. The model is trained with two hierarchical encoders (detail and global encoder) to better capture low and high-level details in images. These learned details are used to train a video diffusion model which refines the video resolution and details in the generated video.</p> <p data-svelte-h="svelte-s88vu7">You can use I2VGen-XL by loading the <a href="/docs/diffusers/pr_10312/en/api/pipelines/i2vgenxl#diffusers.I2VGenXLPipeline">I2VGenXLPipeline</a>, and passing a text and image prompt to generate a video.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> I2VGenXLPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif, load_image | |
| pipeline = I2VGenXLPipeline.from_pretrained(<span class="hljs-string">"ali-vilab/i2vgen-xl"</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">"fp16"</span>) | |
| pipeline.enable_model_cpu_offload() | |
| image_url = <span class="hljs-string">"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"</span> | |
| image = load_image(image_url).convert(<span class="hljs-string">"RGB"</span>) | |
| prompt = <span class="hljs-string">"Papers were floating in the air on a table in the library"</span> | |
| negative_prompt = <span class="hljs-string">"Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"</span> | |
| generator = torch.manual_seed(<span class="hljs-number">8888</span>) | |
| frames = pipeline( | |
| prompt=prompt, | |
| image=image, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| negative_prompt=negative_prompt, | |
| guidance_scale=<span class="hljs-number">9.0</span>, | |
| generator=generator | |
| ).frames[<span class="hljs-number">0</span>] | |
| export_to_gif(frames, <span class="hljs-string">"i2v.gif"</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1fffm0t"><div><img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"> <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption></div></div> <h3 class="relative group"><a id="animatediff" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#animatediff"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>AnimateDiff</span></h3> <p data-svelte-h="svelte-nk5pmd"><a href="../api/pipelines/animatediff">AnimateDiff</a> is an adapter model that inserts a motion module into a pretrained diffusion model to animate an image. The adapter is trained on video clips to learn motion which is used to condition the generation process to create a video. It is faster and easier to only train the adapter and it can be loaded into most diffusion models, effectively turning them into “video models”.</p> <p data-svelte-h="svelte-3rt1h8">Start by loading a <code>MotionAdapter</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AnimateDiffPipeline, DDIMScheduler, MotionAdapter | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif | |
| adapter = MotionAdapter.from_pretrained(<span class="hljs-string">"guoyww/animatediff-motion-adapter-v1-5-2"</span>, torch_dtype=torch.float16)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1iuh0wc">Then load a finetuned Stable Diffusion model with the <a href="/docs/diffusers/pr_10312/en/api/pipelines/animatediff#diffusers.AnimateDiffPipeline">AnimateDiffPipeline</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline = AnimateDiffPipeline.from_pretrained(<span class="hljs-string">"emilianJR/epiCRealism"</span>, motion_adapter=adapter, torch_dtype=torch.float16) | |
| scheduler = DDIMScheduler.from_pretrained( | |
| <span class="hljs-string">"emilianJR/epiCRealism"</span>, | |
| subfolder=<span class="hljs-string">"scheduler"</span>, | |
| clip_sample=<span class="hljs-literal">False</span>, | |
| timestep_spacing=<span class="hljs-string">"linspace"</span>, | |
| beta_schedule=<span class="hljs-string">"linear"</span>, | |
| steps_offset=<span class="hljs-number">1</span>, | |
| ) | |
| pipeline.scheduler = scheduler | |
| pipeline.enable_vae_slicing() | |
| pipeline.enable_model_cpu_offload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-y99twh">Create a prompt and generate the video.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->output = pipeline( | |
| prompt=<span class="hljs-string">"A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution"</span>, | |
| negative_prompt=<span class="hljs-string">"bad quality, worse quality, low resolution"</span>, | |
| num_frames=<span class="hljs-number">16</span>, | |
| guidance_scale=<span class="hljs-number">7.5</span>, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| generator=torch.Generator(<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">49</span>), | |
| ) | |
| frames = output.frames[<span class="hljs-number">0</span>] | |
| export_to_gif(frames, <span class="hljs-string">"animation.gif"</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-1s0a8zo"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff.gif"></div> <h3 class="relative group"><a id="modelscopet2v" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#modelscopet2v"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ModelscopeT2V</span></h3> <p data-svelte-h="svelte-zsx8hb"><a href="../api/pipelines/text_to_video">ModelscopeT2V</a> adds spatial and temporal convolutions and attention to a UNet, and it is trained on image-text and video-text datasets to enhance what it learns during training. The model takes a prompt, encodes it and creates text embeddings which are denoised by the UNet, and then decoded by a VQGAN into a video.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-12heei2">ModelScopeT2V generates watermarked videos due to the datasets it was trained on. To use a watermark-free model, try the <a href="https://huggingface.co/cerspense/zeroscope_v2_576w" rel="nofollow">cerspense/zeroscope_v2_76w</a> model with the <a href="/docs/diffusers/pr_10312/en/api/pipelines/text_to_video#diffusers.TextToVideoSDPipeline">TextToVideoSDPipeline</a> first, and then upscale it’s output with the <a href="https://huggingface.co/cerspense/zeroscope_v2_XL" rel="nofollow">cerspense/zeroscope_v2_XL</a> checkpoint using the <a href="/docs/diffusers/pr_10312/en/api/pipelines/text_to_video#diffusers.VideoToVideoSDPipeline">VideoToVideoSDPipeline</a>.</p></div> <p data-svelte-h="svelte-hoh078">Load a ModelScopeT2V checkpoint into the <a href="/docs/diffusers/pr_10312/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a> along with a prompt to generate a video.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video | |
| pipeline = DiffusionPipeline.from_pretrained(<span class="hljs-string">"damo-vilab/text-to-video-ms-1.7b"</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">"fp16"</span>) | |
| pipeline.enable_model_cpu_offload() | |
| pipeline.enable_vae_slicing() | |
| prompt = <span class="hljs-string">"Confident teddy bear surfer rides the wave in the tropics"</span> | |
| video_frames = pipeline(prompt).frames[<span class="hljs-number">0</span>] | |
| export_to_video(video_frames, <span class="hljs-string">"modelscopet2v.mp4"</span>, fps=<span class="hljs-number">10</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-a7bnxv"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/modelscopet2v.gif"></div> <h2 class="relative group"><a id="configure-model-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configure-model-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configure model parameters</span></h2> <p data-svelte-h="svelte-1bg8xrr">There are a few important parameters you can configure in the pipeline that’ll affect the video generation process and quality. Let’s take a closer look at what these parameters do and how changing them affects the output.</p> <h3 class="relative group"><a id="number-of-frames" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#number-of-frames"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Number of frames</span></h3> <p data-svelte-h="svelte-1he7va5">The <code>num_frames</code> parameter determines how many video frames are generated per second. A frame is an image that is played in a sequence of other frames to create motion or a video. This affects video length because the pipeline generates a certain number of frames per second (check a pipeline’s API reference for the default value). To increase the video duration, you’ll need to increase the <code>num_frames</code> parameter.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableVideoDiffusionPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> load_image, export_to_video | |
| pipeline = StableVideoDiffusionPipeline.from_pretrained( | |
| <span class="hljs-string">"stabilityai/stable-video-diffusion-img2vid"</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">"fp16"</span> | |
| ) | |
| pipeline.enable_model_cpu_offload() | |
| image = load_image(<span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"</span>) | |
| image = image.resize((<span class="hljs-number">1024</span>, <span class="hljs-number">576</span>)) | |
| generator = torch.manual_seed(<span class="hljs-number">42</span>) | |
| frames = pipeline(image, decode_chunk_size=<span class="hljs-number">8</span>, generator=generator, num_frames=<span class="hljs-number">25</span>).frames[<span class="hljs-number">0</span>] | |
| export_to_video(frames, <span class="hljs-string">"generated.mp4"</span>, fps=<span class="hljs-number">7</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1hb15b9"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_14.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=14</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_25.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=25</figcaption></div></div> <h3 class="relative group"><a id="guidance-scale" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#guidance-scale"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Guidance scale</span></h3> <p data-svelte-h="svelte-10ah3pm">The <code>guidance_scale</code> parameter controls how closely aligned the generated video and text prompt or initial image is. A higher <code>guidance_scale</code> value means your generated video is more aligned with the text prompt or initial image, while a lower <code>guidance_scale</code> value means your generated video is less aligned which could give the model more “creativity” to interpret the conditioning input.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1i6okx6">SVD uses the <code>min_guidance_scale</code> and <code>max_guidance_scale</code> parameters for applying guidance to the first and last frames respectively.</p></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> I2VGenXLPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif, load_image | |
| pipeline = I2VGenXLPipeline.from_pretrained(<span class="hljs-string">"ali-vilab/i2vgen-xl"</span>, torch_dtype=torch.float16, variant=<span class="hljs-string">"fp16"</span>) | |
| pipeline.enable_model_cpu_offload() | |
| image_url = <span class="hljs-string">"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"</span> | |
| image = load_image(image_url).convert(<span class="hljs-string">"RGB"</span>) | |
| prompt = <span class="hljs-string">"Papers were floating in the air on a table in the library"</span> | |
| negative_prompt = <span class="hljs-string">"Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"</span> | |
| generator = torch.manual_seed(<span class="hljs-number">0</span>) | |
| frames = pipeline( | |
| prompt=prompt, | |
| image=image, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| negative_prompt=negative_prompt, | |
| guidance_scale=<span class="hljs-number">1.0</span>, | |
| generator=generator | |
| ).frames[<span class="hljs-number">0</span>] | |
| export_to_gif(frames, <span class="hljs-string">"i2v.gif"</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-1491ymz"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=9.0</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guidance_scale_1.0.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=1.0</figcaption></div></div> <h3 class="relative group"><a id="negative-prompt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#negative-prompt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Negative prompt</span></h3> <p data-svelte-h="svelte-xdqe9k">A negative prompt deters the model from generating things you don’t want it to. This parameter is commonly used to improve overall generation quality by removing poor or bad features such as “low resolution” or “bad details”.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AnimateDiffPipeline, DDIMScheduler, MotionAdapter | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_gif | |
| adapter = MotionAdapter.from_pretrained(<span class="hljs-string">"guoyww/animatediff-motion-adapter-v1-5-2"</span>, torch_dtype=torch.float16) | |
| pipeline = AnimateDiffPipeline.from_pretrained(<span class="hljs-string">"emilianJR/epiCRealism"</span>, motion_adapter=adapter, torch_dtype=torch.float16) | |
| scheduler = DDIMScheduler.from_pretrained( | |
| <span class="hljs-string">"emilianJR/epiCRealism"</span>, | |
| subfolder=<span class="hljs-string">"scheduler"</span>, | |
| clip_sample=<span class="hljs-literal">False</span>, | |
| timestep_spacing=<span class="hljs-string">"linspace"</span>, | |
| beta_schedule=<span class="hljs-string">"linear"</span>, | |
| steps_offset=<span class="hljs-number">1</span>, | |
| ) | |
| pipeline.scheduler = scheduler | |
| pipeline.enable_vae_slicing() | |
| pipeline.enable_model_cpu_offload() | |
| output = pipeline( | |
| prompt=<span class="hljs-string">"360 camera shot of a sushi roll in a restaurant"</span>, | |
| negative_prompt=<span class="hljs-string">"Distorted, discontinuous, ugly, blurry, low resolution, motionless, static"</span>, | |
| num_frames=<span class="hljs-number">16</span>, | |
| guidance_scale=<span class="hljs-number">7.5</span>, | |
| num_inference_steps=<span class="hljs-number">50</span>, | |
| generator=torch.Generator(<span class="hljs-string">"cpu"</span>).manual_seed(<span class="hljs-number">0</span>), | |
| ) | |
| frames = output.frames[<span class="hljs-number">0</span>] | |
| export_to_gif(frames, <span class="hljs-string">"animation.gif"</span>)<!-- HTML_TAG_END --></pre></div> <div class="flex gap-4" data-svelte-h="svelte-139sag9"><div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_no_neg.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">no negative prompt</figcaption></div> <div><img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_neg.gif"> <figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt applied</figcaption></div></div> <h3 class="relative group"><a id="model-specific-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#model-specific-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Model-specific parameters</span></h3> <p data-svelte-h="svelte-1d2wgj6">There are some pipeline parameters that are unique to each model such as adjusting the motion in a video or adding noise to the initial image.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Stable Video Diffusion </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Text2Video-Zero </div></div> <div class="language-select"><p data-svelte-h="svelte-17vrdx1">Stable Video Diffusion provides additional micro-conditioning for the frame rate with the <code>fps</code> parameter and for motion with the <code>motion_bucket_id</code> parameter. Together, these parameters allow for adjusting the amount of motion in the generated video.</p> <p data-svelte-h="svelte-1rvhfxn">There is also a <code>noise_aug_strength</code> parameter that increases the amount of noise added to the initial image. Varying this parameter affects how similar the generated video and initial image are. A higher <code>noise_aug_strength</code> also increases the amount of motion. To learn more, read the <a href="../using-diffusers/svd#micro-conditioning">Micro-conditioning</a> guide.</p> </div> <h2 class="relative group"><a id="control-video-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#control-video-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Control video generation</span></h2> <p data-svelte-h="svelte-kknsbm">Video generation can be controlled similar to how text-to-image, image-to-image, and inpainting can be controlled with a <a href="/docs/diffusers/pr_10312/en/api/models/controlnet#diffusers.ControlNetModel">ControlNetModel</a>. The only difference is you need to use the <a href="/docs/diffusers/pr_10312/en/api/attnprocessor#diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor">CrossFrameAttnProcessor</a> so each frame attends to the first frame.</p> <h3 class="relative group"><a id="text2video-zero" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text2video-zero"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text2Video-Zero</span></h3> <p data-svelte-h="svelte-w25jvx">Text2Video-Zero video generation can be conditioned on pose and edge images for even greater control over a subject’s motion in the generated video or to preserve the identity of a subject/object in the video. You can also use Text2Video-Zero with <a href="../api/pipelines/pix2pix">InstructPix2Pix</a> for editing videos with text.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">pose control </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">edge control </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">InstructPix2Pix </div></div> <div class="language-select"><p data-svelte-h="svelte-iz4nwy">Start by downloading a video and extracting the pose images from it.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> hf_hub_download | |
| <span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image | |
| <span class="hljs-keyword">import</span> imageio | |
| filename = <span class="hljs-string">"__assets__/poses_skeleton_gifs/dance1_corr.mp4"</span> | |
| repo_id = <span class="hljs-string">"PAIR/Text2Video-Zero"</span> | |
| video_path = hf_hub_download(repo_type=<span class="hljs-string">"space"</span>, repo_id=repo_id, filename=filename) | |
| reader = imageio.get_reader(video_path, <span class="hljs-string">"ffmpeg"</span>) | |
| frame_count = <span class="hljs-number">8</span> | |
| pose_images = [Image.fromarray(reader.get_data(i)) <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(frame_count)]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14niav1">Load a <a href="/docs/diffusers/pr_10312/en/api/models/controlnet#diffusers.ControlNetModel">ControlNetModel</a> for pose estimation and a checkpoint into the <a href="/docs/diffusers/pr_10312/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline">StableDiffusionControlNetPipeline</a>. Then you’ll use the <a href="/docs/diffusers/pr_10312/en/api/attnprocessor#diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor">CrossFrameAttnProcessor</a> for the UNet and ControlNet.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> StableDiffusionControlNetPipeline, ControlNetModel | |
| <span class="hljs-keyword">from</span> diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero <span class="hljs-keyword">import</span> CrossFrameAttnProcessor | |
| model_id = <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span> | |
| controlnet = ControlNetModel.from_pretrained(<span class="hljs-string">"lllyasviel/sd-controlnet-openpose"</span>, torch_dtype=torch.float16) | |
| pipeline = StableDiffusionControlNetPipeline.from_pretrained( | |
| model_id, controlnet=controlnet, torch_dtype=torch.float16 | |
| ).to(<span class="hljs-string">"cuda"</span>) | |
| pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=<span class="hljs-number">2</span>)) | |
| pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=<span class="hljs-number">2</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-se6rgz">Fix the latents for all the frames, and then pass your prompt and extracted pose images to the model to generate a video.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->latents = torch.randn((<span class="hljs-number">1</span>, <span class="hljs-number">4</span>, <span class="hljs-number">64</span>, <span class="hljs-number">64</span>), device=<span class="hljs-string">"cuda"</span>, dtype=torch.float16).repeat(<span class="hljs-built_in">len</span>(pose_images), <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>) | |
| prompt = <span class="hljs-string">"Darth Vader dancing in a desert"</span> | |
| result = pipeline(prompt=[prompt] * <span class="hljs-built_in">len</span>(pose_images), image=pose_images, latents=latents).images | |
| imageio.mimsave(<span class="hljs-string">"video.mp4"</span>, result, fps=<span class="hljs-number">4</span>)<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="optimize" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimize"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimize</span></h2> <p data-svelte-h="svelte-i4oodb">Video generation requires a lot of memory because you’re generating many video frames at once. You can reduce your memory requirements at the expense of some inference speed. Try:</p> <ol data-svelte-h="svelte-4xypfn"><li>offloading pipeline components that are no longer needed to the CPU</li> <li>feed-forward chunking runs the feed-forward layer in a loop instead of all at once</li> <li>break up the number of frames the VAE has to decode into chunks instead of decoding them all at once</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- pipeline.enable_model_cpu_offload()</span> | |
| <span class="hljs-deletion">- frames = pipeline(image, decode_chunk_size=8, generator=generator).frames[0]</span> | |
| <span class="hljs-addition">+ pipeline.enable_model_cpu_offload()</span> | |
| <span class="hljs-addition">+ pipeline.unet.enable_forward_chunking()</span> | |
| <span class="hljs-addition">+ frames = pipeline(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dynxrt">If memory is not an issue and you want to optimize for speed, try wrapping the UNet with <a href="../optimization/torch2.0#torchcompile"><code>torch.compile</code></a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-deletion">- pipeline.enable_model_cpu_offload()</span> | |
| <span class="hljs-addition">+ pipeline.to("cuda")</span> | |
| <span class="hljs-addition">+ pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/using-diffusers/text-img2vid.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_zvhs44 = { | |
| assets: "/docs/diffusers/pr_10312/en", | |
| base: "/docs/diffusers/pr_10312/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/diffusers/pr_10312/en/_app/immutable/entry/start.203b6290.js"), | |
| import("/docs/diffusers/pr_10312/en/_app/immutable/entry/app.423ea23f.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 266], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 74.4 kB
- Xet hash:
- ca6c7243258922f521736ffbbd30c67e778505c059ff34ca7cdee1434213bdc7
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.