# Stable Video Diffusion

Stable Video Diffusion was proposed in [Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets](https://hf.co/papers/2311.15127) by Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, Varun Jampani, Robin Rombach.

The abstract from the paper is:

*We present Stable Video Diffusion - a latent video diffusion model for high-resolution, state-of-the-art text-to-video and image-to-video generation. Recently, latent diffusion models trained for 2D image synthesis have been turned into generative video models by inserting temporal layers and finetuning them on small, high-quality video datasets. However, training methods in the literature vary widely, and the field has yet to agree on a unified strategy for curating video data. In this paper, we identify and evaluate three different stages for successful training of video LDMs: text-to-image pretraining, video pretraining, and high-quality video finetuning. Furthermore, we demonstrate the necessity of a well-curated pretraining dataset for generating high-quality videos and present a systematic curation process to train a strong base model, including captioning and filtering strategies. We then explore the impact of finetuning our base model on high-quality data and train a text-to-video model that is competitive with closed-source video generation. We also show that our base model provides a powerful motion representation for downstream tasks such as image-to-video generation and adaptability to camera motion-specific LoRA modules. Finally, we demonstrate that our model provides a strong multi-view 3D-prior and can serve as a base to finetune a multi-view diffusion model that jointly generates multiple views of objects in a feedforward fashion, outperforming image-based methods at a fraction of their compute budget. We release code and model weights at this https URL.*

> To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide.
>
> Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints!

## Tips

Video generation is memory-intensive, and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more memory-efficient.

Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
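The sketch below shows one way these memory-saving options might be combined end to end. It is a minimal example rather than the library's canonical snippet: the conditioning image path is a placeholder, and the checkpoint used is the extended-frame one linked above.

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

# Load the extended-frame checkpoint in half precision to lower the baseline memory footprint.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)

# Trade some speed for lower peak memory.
pipe.enable_model_cpu_offload()      # keep submodules on the CPU until they are needed
pipe.unet.enable_forward_chunking()  # run the UNet feedforward layers chunk by chunk

# Placeholder conditioning image; the released checkpoints target roughly 1024x576 inputs.
image = load_image("path/to/conditioning_image.png").resize((1024, 576))

# decode_chunk_size limits how many frames the VAE decodes at once, which also reduces memory use.
frames = pipe(image, decode_chunk_size=2, generator=torch.manual_seed(0)).frames[0]
export_to_video(frames, "generated.mp4", fps=7)
```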
## StableVideoDiffusionPipeline

**class diffusers.StableVideoDiffusionPipeline**(vae: AutoencoderKLTemporalDecoder, image_encoder: CLIPVisionModelWithProjection, unet: UNetSpatioTemporalConditionModel, scheduler: EulerDiscreteScheduler, feature_extractor: CLIPImageProcessor) — [source](https://github.com/huggingface/diffusers/blob/vr_12087/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py#L147)

Pipeline to generate video from an input image using Stable Video Diffusion.

This model inherits from [DiffusionPipeline](/docs/diffusers/pr_12087/en/api/pipelines/overview#diffusers.DiffusionPipeline). Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.).

Parameters:

- **vae** (`AutoencoderKLTemporalDecoder`) — Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
- **image_encoder** ([CLIPVisionModelWithProjection](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPVisionModelWithProjection)) — Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
- **unet** (`UNetSpatioTemporalConditionModel`) — A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
- **scheduler** ([EulerDiscreteScheduler](/docs/diffusers/pr_12087/en/api/schedulers/euler#diffusers.EulerDiscreteScheduler)) — A scheduler to be used in combination with `unet` to denoise the encoded image latents.
- **feature_extractor** ([CLIPImageProcessor](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPImageProcessor)) — A `CLIPImageProcessor` to extract features from generated images.
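As a quick usage sketch for the pipeline itself, again assuming the extended-frame checkpoint and a placeholder conditioning image. The call-time arguments `fps`, `motion_bucket_id`, and `noise_aug_strength` are taken from the pipeline's call signature; the specific values shown are illustrative, not recommendations.

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

image = load_image("path/to/conditioning_image.png").resize((1024, 576))

# A higher motion_bucket_id conditions the model on more motion; a higher
# noise_aug_strength loosens adherence to the conditioning image.
frames = pipe(
    image,
    fps=7,                   # frame rate the video is conditioned on
    motion_bucket_id=180,    # default is 127
    noise_aug_strength=0.1,  # default is 0.02
    generator=torch.manual_seed(42),
).frames[0]
export_to_video(frames, "output.mp4", fps=7)
```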
## StableVideoDiffusionPipelineOutput

**class diffusers.pipelines.stable_video_diffusion.StableVideoDiffusionPipelineOutput**(frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]) — [source](https://github.com/huggingface/diffusers/blob/vr_12087/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py#L133)

Output class for Stable Video Diffusion pipeline.

Parameters:

- **frames** (`List[List[PIL.Image.Image]]`, `np.ndarray`, or `torch.Tensor`) — List of denoised PIL images of length `batch_size`, or a numpy array or torch tensor of shape `(batch_size, num_frames, height, width, num_channels)`.
