Buckets:

rtrm's picture
download
raw
59.5 kB
import{s as ms,o as Js,n as x}from"../chunks/scheduler.e4ff9b64.js";import{S as us,i as ys,e as y,s as c,c as b,h as ds,a as d,d as e,b as r,f as rs,g as Z,j as h,k as rl,l as Us,m as n,n as B,t as _,o as I,p as C}from"../chunks/index.09f1bca0.js";import{C as js,H as yl,E as hs}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.bbad1169.js";import{C as Q}from"../chunks/CodeBlock.f8309f3f.js";import{H as Nl,a as N}from"../chunks/HfOption.44827c7f.js";function fs(G){let t,J="要在 FLUX.1-dev 上应用第一块缓存,请调用 <code>apply_cache_on_pipe</code>,如下所示。0.08 是 FLUX 模型的默认残差差异值。",p,o,s,a,U='<thead><tr><th>优化</th> <th>原始</th> <th>FBCache rdt=0.06</th> <th>FBCache rdt=0.08</th> <th>FBCache rdt=0.10</th> <th>FBCache rdt=0.12</th></tr></thead> <tbody><tr><td>预览</td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-original.png" alt="Original"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.06.png" alt="FBCache rdt=0.06"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.08.png" alt="FBCache rdt=0.08"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.10.png" alt="FBCache rdt=0.10"/></td> <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.12.png" alt="FBCache rdt=0.12"/></td></tr> <tr><td>墙时间 (s)</td> <td>26.36</td> <td>21.83</td> <td>17.01</td> <td>16.00</td> <td>13.78</td></tr></tbody>',f,m,g="First Block Cache 将推理速度降低到 17.01 秒,与基线相比,或快 1.55 倍,同时保持几乎零质量损失。",T;return o=new 
Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEZsdXhQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBGbHV4UGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMmJsYWNrLWZvcmVzdC1sYWJzJTJGRkxVWC4xLWRldiUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMkMlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5maXJzdF9ibG9ja19jYWNoZS5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBhcHBseV9jYWNoZV9vbl9waXBlJTBBJTBBYXBwbHlfY2FjaGVfb25fcGlwZShwaXBlJTJDJTIwcmVzaWR1YWxfZGlmZl90aHJlJTBBc2hvbGQlM0QwLjA4KSUwQSUwQSUyMyUyMCVFNSU5MCVBRiVFNyU5NCVBOCVFNSU4NiU4NSVFNSVBRCU5OCVFOCU4QSU4MiVFNyU5QyU4MSUwQSUyMyUyMHBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKCklMEElMjMlMjBwaXBlLmVuYWJsZV9zZXF1ZW50aWFsX2NwdV9vZmZsb2FkKCklMEElMEFiZWdpbiUyMCUzRCUyMHRpbWUudGltZSgpJTBBaW1hZ2UlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMkElMjBjYXQlMjBob2xkaW5nJTIwYSUyMHNpZ24lMjB0aGF0JTIwc2F5cyUyMGhlbGxvJTIwd29ybGQlMjIlMkMlMEElMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMjglMkMlMEEpLmltYWdlcyU1QjAlNUQlMEFlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQXByaW50KGYlMjJUaW1lJTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMEFwcmludCglMjJTYXZpbmclMjBpbWFnZSUyMHRvJTIwZmx1eC5wbmclMjIpJTBBaW1hZ2Uuc2F2ZSglMjJmbHV4LnBuZyUyMik=",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe, residual_diff_threshold=<span class="hljs-number">0.08</span>)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
begin = time.time()
image = pipe(
<span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
num_inference_steps=<span class="hljs-number">28</span>,
).images[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving image to flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)`,wrap:!1}}),{c(){t=y("p"),t.innerHTML=J,p=c(),b(o.$$.fragment),s=c(),a=y("table"),a.innerHTML=U,f=c(),m=y("p"),m.textContent=g},l(u){t=d(u,"P",{"data-svelte-h":!0}),h(t)!=="svelte-13o107o"&&(t.innerHTML=J),p=r(u),Z(o.$$.fragment,u),s=r(u),a=d(u,"TABLE",{"data-svelte-h":!0}),h(a)!=="svelte-9gupjq"&&(a.innerHTML=U),f=r(u),m=d(u,"P",{"data-svelte-h":!0}),h(m)!=="svelte-59xgrt"&&(m.textContent=g)},m(u,W){n(u,t,W),n(u,p,W),B(o,u,W),n(u,s,W),n(u,a,W),n(u,f,W),n(u,m,W),T=!0},p:x,i(u){T||(_(o.$$.fragment,u),T=!0)},o(u){I(o.$$.fragment,u),T=!1},d(u){u&&(e(t),e(p),e(s),e(a),e(f),e(m)),C(o,u)}}}function ws(G){let t,J="要在 HunyuanVideo 上应用 First Block Cache,请使用 <code>apply_cache_on_pipe</code>,如下所示。0.06 是 HunyuanVideo 模型的默认残差差值。",p,o,s,a,U=`<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-original.mp4" type="video/mp4"/>
您的浏览器不支持视频标签。`,f,m,g="HunyuanVideo 无 FBCache",T,u,W=`<source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-fbc.mp4" type="video/mp4"/>
Your browser does not support the video tag.`,M,w,ml="HunyuanVideo 与 FBCache",X,k,Jl="First Block Cache 将推理速度降低至 2271.06 秒,相比基线快了 1.62 倍,同时保持了几乎为零的质量损失。",v;return o=new Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lJTJDJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsJTBBZnJvbSUyMGRpZmZ1c2Vycy51dGlscyUyMGltcG9ydCUyMGV4cG9ydF90b192aWRlbyUwQSUwQW1vZGVsX2lkJTIwJTNEJTIwJTIydGVuY2VudCUyRkh1bnl1YW5WaWRlbyUyMiUwQXRyYW5zZm9ybWVyJTIwJTNEJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHN1YmZvbGRlciUzRCUyMnRyYW5zZm9ybWVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMHJldmlzaW9uJTNEJTIycmVmcyUyRnByJTJGMTglMjIlMkMlMEEpJTBBcGlwZSUyMCUzRCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHRyYW5zZm9ybWVyJTNEdHJhbnNmb3JtZXIlMkMlMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMEElMjAlMjAlMjAlMjByZXZpc2lvbiUzRCUyMnJlZnMlMkZwciUyRjE4JTIyJTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWZyb20lMjBwYXJhX2F0dG4uZmlyc3RfYmxvY2tfY2FjaGUuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwYXBwbHlfY2FjaGVfb25fcGlwZSUwQSUwQWFwcGx5X2NhY2hlX29uX3BpcGUocGlwZSUyQyUyMHJlc2lkdWFsX2RpZmZfdGhyZXNob2xkJTNEMC42KSUwQSUwQXBpcGUudmFlLmVuYWJsZV90aWxpbmcoKSUwQSUwQWJlZ2luJTIwJTNEJTIwdGltZS50aW1lKCklMEFvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBjYXQlMjB3YWxrcyUyMG9uJTIwdGhlJTIwZ3Jhc3MlMkMlMjByZWFsaXN0aWMlMjIlMkMlMEElMjAlMjAlMjAlMjBoZWlnaHQlM0Q3MjAlMkMlMEElMjAlMjAlMjAlMjB3aWR0aCUzRDEyODAlMkMlMEElMjAlMjAlMjAlMjBudW1fZnJhbWVzJTNEMTI5JTJDJTBBJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDMwJTJDJTBBKS5mcmFtZXMlNUIwJTVEJTBBZW5kJTIwJTNEJTIwdGltZS50aW1lKCklMEFwcmludChmJTIyVGltZSUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBcHJpbnQoJTIyU2F2aW5nJTIwdmlkZW8lMjB0byUyMGh1bnl1YW5fdmlkZW8ubXA0JTIyKSUwQWV4cG9ydF90b192aWRlbyhvdXRwdXQlMkMlMjAlMjJodW55dWFuX3ZpZGVvLm1
wNCUyMiUyQyUyMGZwcyUzRDE1KQ==",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video
model_id = <span class="hljs-string">&quot;tencent/HunyuanVideo&quot;</span>
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
model_id,
subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
torch_dtype=torch.bfloat16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
)
pipe = HunyuanVideoPipeline.from_pretrained(
model_id,
transformer=transformer,
torch_dtype=torch.float16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe, residual_diff_threshold=<span class="hljs-number">0.6</span>)
pipe.vae.enable_tiling()
begin = time.time()
output = pipe(
prompt=<span class="hljs-string">&quot;A cat walks on the grass, realistic&quot;</span>,
height=<span class="hljs-number">720</span>,
width=<span class="hljs-number">1280</span>,
num_frames=<span class="hljs-number">129</span>,
num_inference_steps=<span class="hljs-number">30</span>,
).frames[<span class="hljs-number">0</span>]
end = time.time()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving video to hunyuan_video.mp4&quot;</span>)
export_to_video(output, <span class="hljs-string">&quot;hunyuan_video.mp4&quot;</span>, fps=<span class="hljs-number">15</span>)`,wrap:!1}}),{c(){t=y("p"),t.innerHTML=J,p=c(),b(o.$$.fragment),s=c(),a=y("video"),a.innerHTML=U,f=c(),m=y("small"),m.textContent=g,T=c(),u=y("video"),u.innerHTML=W,M=c(),w=y("small"),w.textContent=ml,X=c(),k=y("p"),k.textContent=Jl,this.h()},l(j){t=d(j,"P",{"data-svelte-h":!0}),h(t)!=="svelte-1enthpa"&&(t.innerHTML=J),p=r(j),Z(o.$$.fragment,j),s=r(j),a=d(j,"VIDEO",{"data-svelte-h":!0}),h(a)!=="svelte-7ylzx3"&&(a.innerHTML=U),f=r(j),m=d(j,"SMALL",{"data-svelte-h":!0}),h(m)!=="svelte-1xtgkhf"&&(m.textContent=g),T=r(j),u=d(j,"VIDEO",{"data-svelte-h":!0}),h(u)!=="svelte-1lwel73"&&(u.innerHTML=W),M=r(j),w=d(j,"SMALL",{"data-svelte-h":!0}),h(w)!=="svelte-1omedp"&&(w.textContent=ml),X=r(j),k=d(j,"P",{"data-svelte-h":!0}),h(k)!=="svelte-b3qx2r"&&(k.textContent=Jl),this.h()},h(){a.controls="",u.controls=""},m(j,V){n(j,t,V),n(j,p,V),B(o,j,V),n(j,s,V),n(j,a,V),n(j,f,V),n(j,m,V),n(j,T,V),n(j,u,V),n(j,M,V),n(j,w,V),n(j,X,V),n(j,k,V),v=!0},p:x,i(j){v||(_(o.$$.fragment,j),v=!0)},o(j){I(o.$$.fragment,j),v=!1},d(j){j&&(e(t),e(p),e(s),e(a),e(f),e(m),e(T),e(u),e(M),e(w),e(X),e(k)),C(o,j)}}}function Ts(G){let t,J,p,o;return t=new N({props:{id:"first-block-cache",option:"FLUX-1.dev",$$slots:{default:[fs]},$$scope:{ctx:G}}}),p=new N({props:{id:"first-block-cache",option:"HunyuanVideo",$$slots:{default:[ws]},$$scope:{ctx:G}}}),{c(){b(t.$$.fragment),J=c(),b(p.$$.fragment)},l(s){Z(t.$$.fragment,s),J=r(s),Z(p.$$.fragment,s)},m(s,a){B(t,s,a),n(s,J,a),B(p,s,a),o=!0},p(s,a){const U={};a&2&&(U.$$scope={dirty:a,ctx:s}),t.$set(U);const f={};a&2&&(f.$$scope={dirty:a,ctx:s}),p.$set(f)},i(s){o||(_(t.$$.fragment,s),_(p.$$.fragment,s),o=!0)},o(s){I(t.$$.fragment,s),I(p.$$.fragment,s),o=!1},d(s){s&&e(J),C(t,s),C(p,s)}}}function bs(G){let t,J,p,o="fp8 动态量化和 torch.compile 将推理速度降低至 7.56 秒,相比基线快了 3.48 倍。",s;return t=new 
Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEZsdXhQaXBlbGluZSUwQSUwQXBpcGUlMjAlM0QlMjBGbHV4UGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMmJsYWNrLWZvcmVzdC1sYWJzJTJGRkxVWC4xLWRldiUyMiUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guYmZsb2F0MTYlMkMlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5maXJzdF9ibG9ja19jYWNoZS5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBhcHBseV9jYWNoZV9vbl9waXBlJTBBJTBBYXBwbHlfY2FjaGVfb25fcGlwZSglMEElMjAlMjAlMjAlMjBwaXBlJTJDJTBBJTIwJTIwJTIwJTIwcmVzaWR1YWxfZGlmZl90aHJlc2hvbGQlM0QwLjEyJTJDJTIwJTIwJTIzJTIwJUU0JUJEJUJGJUU3JTk0JUE4JUU2JTlCJUI0JUU1JUE0JUE3JUU3JTlBJTg0JUU1JTgwJUJDJUU0JUJCJUE1JUU0JUJEJUJGJUU3JUJDJTkzJUU1JUFEJTk4JUU3JTk0JTlGJUU2JTk1JTg4JTBBKSUwQSUwQWZyb20lMjB0b3JjaGFvLnF1YW50aXphdGlvbiUyMGltcG9ydCUyMHF1YW50aXplXyUyQyUyMGZsb2F0OF9keW5hbWljX2FjdGl2YXRpb25fZmxvYXQ4X3dlaWdodCUyQyUyMGZsb2F0OF93ZWlnaHRfb25seSUwQSUwQXF1YW50aXplXyhwaXBlLnRleHRfZW5jb2RlciUyQyUyMGZsb2F0OF93ZWlnaHRfb25seSgpKSUwQXF1YW50aXplXyhwaXBlLnRyYW5zZm9ybWVyJTJDJTIwZmxvYXQ4X2R5bmFtaWNfYWN0aXZhdGlvbl9mbG9hdDhfd2VpZ2h0KCkpJTBBcGlwZS50cmFuc2Zvcm1lciUyMCUzRCUyMHRvcmNoLmNvbXBpbGUoJTBBJTIwJTIwJTIwcGlwZS50cmFuc2Zvcm1lciUyQyUyMG1vZGUlM0QlMjJtYXgtYXV0b3R1bmUtbm8tY3VkYWdyYXBocyUyMiUyQyUwQSklMEElMEElMjMlMjAlRTUlOTAlQUYlRTclOTQlQTglRTUlODYlODUlRTUlQUQlOTglRTglOEElODIlRTclOUMlODElMEElMjMlMjBwaXBlLmVuYWJsZV9tb2RlbF9jcHVfb2ZmbG9hZCgpJTBBJTIzJTIwcGlwZS5lbmFibGVfc2VxdWVudGlhbF9jcHVfb2ZmbG9hZCgpJTBBJTBBZm9yJTIwaSUyMGluJTIwcmFuZ2UoMiklM0ElMEElMjAlMjAlMjAlMjBiZWdpbiUyMCUzRCUyMHRpbWUudGltZSgpJTBBJTIwJTIwJTIwJTIwaW1hZ2UlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMkElMjBjYXQlMjBob2xkaW5nJTIwYSUyMHNpZ24lMjB0aGF0JTIwc2F5cyUyMGhlbGxvJTIwd29ybGQlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBudW1faW5mZXJlbmNlX3N0ZXBzJTNEMjglMkMlMEElMjAlMjAlMjAlMjApLmltYWdlcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyJUU5JUEyJTg0
JUU3JTgzJUFEJUU2JTk3JUI2JUU5JTk3JUI0JTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMiVFNiU5NyVCNiVFOSU5NyVCNCUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBcHJpbnQoJTIyJUU0JUJGJTlEJUU1JUFEJTk4JUU1JTlCJUJFJUU1JTgzJThGJUU1JTg4JUIwJTIwZmx1eC5wbmclMjIpJTBBaW1hZ2Uuc2F2ZSglMjJmbHV4LnBuZyUyMik=",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(
pipe,
residual_diff_threshold=<span class="hljs-number">0.12</span>, <span class="hljs-comment"># 使用更大的值以使缓存生效</span>
)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
    begin = time.time()
    image = pipe(
        <span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
        num_inference_steps=<span class="hljs-number">28</span>,
    ).images[<span class="hljs-number">0</span>]
    end = time.time()
    <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
        <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
    <span class="hljs-keyword">else</span>:
        <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;保存图像到 flux.png&quot;</span>)
image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)`,wrap:!1}}),{c(){b(t.$$.fragment),J=c(),p=y("p"),p.textContent=o},l(a){Z(t.$$.fragment,a),J=r(a),p=d(a,"P",{"data-svelte-h":!0}),h(p)!=="svelte-vnhv5y"&&(p.textContent=o)},m(a,U){B(t,a,U),n(a,J,U),n(a,p,U),s=!0},p:x,i(a){s||(_(t.$$.fragment,a),s=!0)},o(a){I(t.$$.fragment,a),s=!1},d(a){a&&(e(J),e(p)),C(t,a)}}}function Zs(G){let t,J,p,o="NVIDIA L20 GPU 仅有 48GB 内存,在编译后且如果未调用 <code>enable_model_cpu_offload</code> 时,可能会遇到内存不足(OOM)错误,因为 HunyuanVideo 在高分辨率和大量帧数运行时具有非常大的激活张量。对于内存少于 80GB 的 GPU,可以尝试降低分辨率和帧数来避免 OOM 错误。",s,a,U="大型视频生成模型通常受注意力计算而非全连接层的瓶颈限制。这些模型不会从量化和 torch.compile 中显著受益。",f;return t=new Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBZnJvbSUyMGRpZmZ1c2VycyUyMGltcG9ydCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lJTJDJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsJTBBZnJvbSUyMGRpZmZ1c2Vycy51dGlscyUyMGltcG9ydCUyMGV4cG9ydF90b192aWRlbyUwQSUwQW1vZGVsX2lkJTIwJTNEJTIwJTIydGVuY2VudCUyRkh1bnl1YW5WaWRlbyUyMiUwQXRyYW5zZm9ybWVyJTIwJTNEJTIwSHVueXVhblZpZGVvVHJhbnNmb3JtZXIzRE1vZGVsLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHN1YmZvbGRlciUzRCUyMnRyYW5zZm9ybWVyJTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMHJldmlzaW9uJTNEJTIycmVmcyUyRnByJTJGMTglMjIlMkMlMEEpJTBBcGlwZSUyMCUzRCUyMEh1bnl1YW5WaWRlb1BpcGVsaW5lLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjBtb2RlbF9pZCUyQyUwQSUyMCUyMCUyMCUyMHRyYW5zZm9ybWVyJTNEdHJhbnNmb3JtZXIlMkMlMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmZsb2F0MTYlMkMlMEElMjAlMjAlMjAlMjByZXZpc2lvbiUzRCUyMnJlZnMlMkZwciUyRjE4JTIyJTJDJTBBKS50byglMjJjdWRhJTIyKSUwQSUwQWZyb20lMjBwYXJhX2F0dG4uZmlyc3RfYmxvY2tfY2FjaGUuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwYXBwbHlfY2FjaGVfb25fcGlwZSUwQSUwQWFwcGx5X2NhY2hlX29uX3BpcGUocGlwZSklMEElMEFmcm9tJTIwdG9yY2hhby5xdWFudGl6YXRpb24lMjBpbXBvcnQlMjBxdWFudGl6ZV8lMkMlMjBmbG9hdDhfZHluYW1pY19hY3RpdmF0aW9uX2Zsb2F0OF93ZWlnaHQlMkMlMjBmbG9hdDhfd2VpZ2h0X29ubHklMEElMEFxdWFudGl6ZV8ocGlwZS50ZXh0X2VuY29kZXIlMkMlMjBmbG9hdDhfd2Vp
Z2h0X29ubHkoKSklMEFxdWFudGl6ZV8ocGlwZS50cmFuc2Zvcm1lciUyQyUyMGZsb2F0OF9keW5hbWljX2FjdGl2YXRpb25fZmxvYXQ4X3dlaWdodCgpKSUwQXBpcGUudHJhbnNmb3JtZXIlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMHBpcGUudHJhbnNmb3JtZXIlMkMlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lLW5vLWN1ZGFncmFwaHMlMjIlMkMlMEEpJTBBJTBBJTIzJTIwRW5hYmxlJTIwbWVtb3J5JTIwc2F2aW5ncyUwQXBpcGUudmFlLmVuYWJsZV90aWxpbmcoKSUwQSUyMyUyMHBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKCklMEElMjMlMjBwaXBlLmVuYWJsZV9zZXF1ZW50aWFsX2NwdV9vZmZsb2FkKCklMEElMEFmb3IlMjBpJTIwaW4lMjByYW5nZSgyKSUzQSUwQSUyMCUyMCUyMCUyMGJlZ2luJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjBvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBjYXQlMjB3YWxrcyUyMG9uJTIwdGhlJTIwZ3Jhc3MlMkMlMjByZWFsaXN0aWMlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBoZWlnaHQlM0Q3MjAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aWR0aCUzRDEyODAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBudW1fZnJhbWVzJTNEMTI5JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDElMjBpZiUyMGklMjAlM0QlM0QlMjAwJTIwZWxzZSUyMDMwJTJDJTBBJTIwJTIwJTIwJTIwKS5mcmFtZXMlNUIwJTVEJTBBJTIwJTIwJTIwJTIwZW5kJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjBpZiUyMGklMjAlM0QlM0QlMjAwJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMldhcm0lMjB1cCUyMHRpbWUlM0ElMjAlN0JlbmQlMjAtJTIwYmVnaW4lM0EuMmYlN0RzJTIyKSUwQSUyMCUyMCUyMCUyMGVsc2UlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyVGltZSUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBcHJpbnQoJTIyU2F2aW5nJTIwdmlkZW8lMjB0byUyMGh1bnl1YW5fdmlkZW8ubXA0JTIyKSUwQWV4cG9ydF90b192aWRlbyhvdXRwdXQlMkMlMjAlMjJodW55dWFuX3ZpZGVvLm1wNCUyMiUyQyUyMGZwcyUzRDE1KQ==",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video
model_id = <span class="hljs-string">&quot;tencent/HunyuanVideo&quot;</span>
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
model_id,
subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
torch_dtype=torch.bfloat16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
)
pipe = HunyuanVideoPipeline.from_pretrained(
model_id,
transformer=transformer,
torch_dtype=torch.float16,
revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># Enable memory savings</span>
pipe.vae.enable_tiling()
<span class="hljs-comment"># pipe.enable_model_cpu_offload()</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload()</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
    begin = time.time()
    output = pipe(
        prompt=<span class="hljs-string">&quot;A cat walks on the grass, realistic&quot;</span>,
        height=<span class="hljs-number">720</span>,
        width=<span class="hljs-number">1280</span>,
        num_frames=<span class="hljs-number">129</span>,
        num_inference_steps=<span class="hljs-number">1</span> <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-number">30</span>,
    ).frames[<span class="hljs-number">0</span>]
    end = time.time()
    <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
        <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Warm up time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
    <span class="hljs-keyword">else</span>:
        <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving video to hunyuan_video.mp4&quot;</span>)
export_to_video(output, <span class="hljs-string">&quot;hunyuan_video.mp4&quot;</span>, fps=<span class="hljs-number">15</span>)`,wrap:!1}}),{c(){b(t.$$.fragment),J=c(),p=y("p"),p.innerHTML=o,s=c(),a=y("p"),a.textContent=U},l(m){Z(t.$$.fragment,m),J=r(m),p=d(m,"P",{"data-svelte-h":!0}),h(p)!=="svelte-7u4mbb"&&(p.innerHTML=o),s=r(m),a=d(m,"P",{"data-svelte-h":!0}),h(a)!=="svelte-12a4nr6"&&(a.textContent=U)},m(m,g){B(t,m,g),n(m,J,g),n(m,p,g),n(m,s,g),n(m,a,g),f=!0},p:x,i(m){f||(_(t.$$.fragment,m),f=!0)},o(m){I(t.$$.fragment,m),f=!1},d(m){m&&(e(J),e(p),e(s),e(a)),C(t,m)}}}function Bs(G){let t,J,p,o;return t=new N({props:{id:"fp8-quantization",option:"FLUX-1.dev",$$slots:{default:[bs]},$$scope:{ctx:G}}}),p=new N({props:{id:"fp8-quantization",option:"HunyuanVideo",$$slots:{default:[Zs]},$$scope:{ctx:G}}}),{c(){b(t.$$.fragment),J=c(),b(p.$$.fragment)},l(s){Z(t.$$.fragment,s),J=r(s),Z(p.$$.fragment,s)},m(s,a){B(t,s,a),n(s,J,a),B(p,s,a),o=!0},p(s,a){const U={};a&2&&(U.$$scope={dirty:a,ctx:s}),t.$set(U);const f={};a&2&&(f.$$scope={dirty:a,ctx:s}),p.$set(f)},i(s){o||(_(t.$$.fragment,s),_(p.$$.fragment,s),o=!0)},o(s){I(t.$$.fragment,s),I(p.$$.fragment,s),o=!1},d(s){s&&e(J),C(t,s),C(p,s)}}}function _s(G){let t,J="以下代码示例结合了第一块缓存、fp8动态量化、torch.compile和上下文并行,以实现最快的推理速度。",p,o,s,a,U='保存到<code>run_flux.py</code>并使用<a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>启动。',f,m,g,T,u="推理速度降至8.20秒,相比基线快了3.21倍,使用2个NVIDIA L20 GPU。在4个L20上,推理速度为3.90秒,快了6.75倍。",W;return o=new 
Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBaW1wb3J0JTIwdG9yY2guZGlzdHJpYnV0ZWQlMjBhcyUyMGRpc3QlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwRmx1eFBpcGVsaW5lJTBBJTBBZGlzdC5pbml0X3Byb2Nlc3NfZ3JvdXAoKSUwQSUwQXRvcmNoLmN1ZGEuc2V0X2RldmljZShkaXN0LmdldF9yYW5rKCkpJTBBJTBBcGlwZSUyMCUzRCUyMEZsdXhQaXBlbGluZS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIyYmxhY2stZm9yZXN0LWxhYnMlMkZGTFVYLjEtZGV2JTIyJTJDJTBBJTIwJTIwJTIwJTIwdG9yY2hfZHR5cGUlM0R0b3JjaC5iZmxvYXQxNiUyQyUwQSkudG8oJTIyY3VkYSUyMiklMEElMEFmcm9tJTIwcGFyYV9hdHRuLmNvbnRleHRfcGFyYWxsZWwlMjBpbXBvcnQlMjBpbml0X2NvbnRleHRfcGFyYWxsZWxfbWVzaCUwQWZyb20lMjBwYXJhX2F0dG4uY29udGV4dF9wYXJhbGxlbC5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBwYXJhbGxlbGl6ZV9waXBlJTBBZnJvbSUyMHBhcmFfYXR0bi5wYXJhbGxlbF92YWUuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwcGFyYWxsZWxpemVfdmFlJTBBJTBBbWVzaCUyMCUzRCUyMGluaXRfY29udGV4dF9wYXJhbGxlbF9tZXNoKCUwQSUyMCUyMCUyMCUyMHBpcGUuZGV2aWNlLnR5cGUlMkMlMEElMjAlMjAlMjAlMjBtYXhfcmluZ19kaW1fc2l6ZSUzRDIlMkMlMEEpJTBBcGFyYWxsZWxpemVfcGlwZSglMEElMjAlMjAlMjAlMjBwaXBlJTJDJTBBJTIwJTIwJTIwJTIwbWVzaCUzRG1lc2glMkMlMEEpJTBBcGFyYWxsZWxpemVfdmFlKHBpcGUudmFlJTJDJTIwbWVzaCUzRG1lc2guX2ZsYXR0ZW4oKSklMEElMEFmcm9tJTIwcGFyYV9hdHRuLmZpcnN0X2Jsb2NrX2NhY2hlLmRpZmZ1c2Vyc19hZGFwdGVycyUyMGltcG9ydCUyMGFwcGx5X2NhY2hlX29uX3BpcGUlMEElMEFhcHBseV9jYWNoZV9vbl9waXBlKCUwQSUyMCUyMCUyMCUyMHBpcGUlMkMlMEElMjAlMjAlMjAlMjByZXNpZHVhbF9kaWZmX3RocmVzaG9sZCUzRDAuMTIlMkMlMjAlMjAlMjMlMjAlRTQlQkQlQkYlRTclOTQlQTglRTglQkUlODMlRTUlQTQlQTclRTclOUElODQlRTUlODAlQkMlRTQlQkIlQTUlRTQlQkQlQkYlRTclQkMlOTMlRTUlQUQlOTglRTclOTQlOUYlRTYlOTUlODglMEEpJTBBJTBBZnJvbSUyMHRvcmNoYW8ucXVhbnRpemF0aW9uJTIwaW1wb3J0JTIwcXVhbnRpemVfJTJDJTIwZmxvYXQ4X2R5bmFtaWNfYWN0aXZhdGlvbl9mbG9hdDhfd2VpZ2h0JTJDJTIwZmxvYXQ4X3dlaWdodF9vbmx5JTBBJTBBcXVhbnRpemVfKHBpcGUudGV4dF9lbmNvZGVyJTJDJTIwZmxvYXQ4X3dlaWdodF9vbmx5KCkpJTBBcXVhbnRpemVfKHBpcGUudHJhbnNmb3JtZXIlMkMlMjBmbG9hdDhfZHluYW1pY19hY3RpdmF0aW9uX2Zsb2F0OF93ZWlnaHQoKSklMEF0b3JjaC5faW5kdWN0b3IuY29uZmlnLnJlb3JkZXJfZm9yX2NvbXB1dGVfY29tbV9vdmVybGFwJTIwJTNEJTIwVHJ1
ZSUwQXBpcGUudHJhbnNmb3JtZXIlMjAlM0QlMjB0b3JjaC5jb21waWxlKCUwQSUyMCUyMCUyMHBpcGUudHJhbnNmb3JtZXIlMkMlMjBtb2RlJTNEJTIybWF4LWF1dG90dW5lLW5vLWN1ZGFncmFwaHMlMjIlMkMlMEEpJTBBJTBBJTIzJTIwJUU1JTkwJUFGJUU3JTk0JUE4JUU1JTg2JTg1JUU1JUFEJTk4JUU4JThBJTgyJUU3JTlDJTgxJTBBJTIzJTIwcGlwZS5lbmFibGVfbW9kZWxfY3B1X29mZmxvYWQoZ3B1X2lkJTNEZGlzdC5nZXRfcmFuaygpKSUwQSUyMyUyMHBpcGUuZW5hYmxlX3NlcXVlbnRpYWxfY3B1X29mZmxvYWQoZ3B1X2lkJTNEZGlzdC5nZXRfcmFuaygpKSUwQSUwQWZvciUyMGklMjBpbiUyMHJhbmdlKDIpJTNBJTBBJTIwJTIwJTIwJTIwYmVnaW4lMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGltYWdlJTIwJTNEJTIwcGlwZSglMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJBJTIwY2F0JTIwaG9sZGluZyUyMGElMjBzaWduJTIwdGhhdCUyMHNheXMlMjBoZWxsbyUyMHdvcmxkJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDI4JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3V0cHV0X3R5cGUlM0QlMjJwaWwlMjIlMjBpZiUyMGRpc3QuZ2V0X3JhbmsoKSUyMCUzRCUzRCUyMDAlMjBlbHNlJTIwJTIycHQlMjIlMkMlMEElMjAlMjAlMjAlMjApLmltYWdlcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGlmJTIwZGlzdC5nZXRfcmFuaygpJTIwJTNEJTNEJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyJUU5JUEyJTg0JUU3JTgzJUFEJUU2JTk3JUI2JUU5JTk3JUI0JTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMiVFNiU5NyVCNiVFOSU5NyVCNCUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBaWYlMjBkaXN0LmdldF9yYW5rKCklMjAlM0QlM0QlMjAwJTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoJTIyJUU1JUIwJTg2JUU1JTlCJUJFJUU1JTgzJThGJUU0JUJGJTlEJUU1JUFEJTk4JUU1JTg4JUIwZmx1eC5wbmclMjIpJTBBJTIwJTIwJTIwJTIwaW1hZ2Uuc2F2ZSglMjJmbHV4LnBuZyUyMiklMEElMEFkaXN0LmRlc3Ryb3lfcHJvY2Vzc19ncm91cCgp",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torch.distributed <span class="hljs-keyword">as</span> dist
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
dist.init_process_group()
torch.cuda.set_device(dist.get_rank())
pipe = FluxPipeline.from_pretrained(
<span class="hljs-string">&quot;black-forest-labs/FLUX.1-dev&quot;</span>,
torch_dtype=torch.bfloat16,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.context_parallel <span class="hljs-keyword">import</span> init_context_parallel_mesh
<span class="hljs-keyword">from</span> para_attn.context_parallel.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_pipe
<span class="hljs-keyword">from</span> para_attn.parallel_vae.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_vae
mesh = init_context_parallel_mesh(
pipe.device.<span class="hljs-built_in">type</span>,
max_ring_dim_size=<span class="hljs-number">2</span>,
)
parallelize_pipe(
pipe,
mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(
pipe,
residual_diff_threshold=<span class="hljs-number">0.12</span>, <span class="hljs-comment"># 使用较大的值以使缓存生效</span>
)
<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
quantize_(pipe.text_encoder, float8_weight_only())
quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
torch._inductor.config.reorder_for_compute_comm_overlap = <span class="hljs-literal">True</span>
pipe.transformer = torch.<span class="hljs-built_in">compile</span>(
pipe.transformer, mode=<span class="hljs-string">&quot;max-autotune-no-cudagraphs&quot;</span>,
)
<span class="hljs-comment"># 启用内存节省</span>
<span class="hljs-comment"># pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
    begin = time.time()
    image = pipe(
        <span class="hljs-string">&quot;A cat holding a sign that says hello world&quot;</span>,
        num_inference_steps=<span class="hljs-number">28</span>,
        output_type=<span class="hljs-string">&quot;pil&quot;</span> <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;pt&quot;</span>,
    ).images[<span class="hljs-number">0</span>]
    end = time.time()
    <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
        <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
            <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
        <span class="hljs-keyword">else</span>:
            <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
    <span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;将图像保存到flux.png&quot;</span>)
    image.save(<span class="hljs-string">&quot;flux.png&quot;</span>)
dist.destroy_process_group()`,wrap:!1}}),m=new Q({props:{code:"JTIzJTIwJUU0JUJEJUJGJUU3JTk0JUE4LS1ucHJvY19wZXJfbm9kZSVFNiU4QyU4NyVFNSVBRSU5QUdQVSVFNiU5NSVCMCVFOSU4NyU4RiUwQXRvcmNocnVuJTIwLS1ucHJvY19wZXJfbm9kZSUzRDIlMjBydW5fZmx1eC5weQ==",highlighted:`<span class="hljs-comment"># 使用--nproc_per_node指定GPU数量</span>
torchrun --nproc_per_node=2 run_flux.py`,wrap:!1}}),{c(){t=y("p"),t.textContent=J,p=c(),b(o.$$.fragment),s=c(),a=y("p"),a.innerHTML=U,f=c(),b(m.$$.fragment),g=c(),T=y("p"),T.textContent=u},l(M){t=d(M,"P",{"data-svelte-h":!0}),h(t)!=="svelte-1wcnxcs"&&(t.textContent=J),p=r(M),Z(o.$$.fragment,M),s=r(M),a=d(M,"P",{"data-svelte-h":!0}),h(a)!=="svelte-bdq9oz"&&(a.innerHTML=U),f=r(M),Z(m.$$.fragment,M),g=r(M),T=d(M,"P",{"data-svelte-h":!0}),h(T)!=="svelte-19q14lf"&&(T.textContent=u)},m(M,w){n(M,t,w),n(M,p,w),B(o,M,w),n(M,s,w),n(M,a,w),n(M,f,w),B(m,M,w),n(M,g,w),n(M,T,w),W=!0},p:x,i(M){W||(_(o.$$.fragment,M),_(m.$$.fragment,M),W=!0)},o(M){I(o.$$.fragment,M),I(m.$$.fragment,M),W=!1},d(M){M&&(e(t),e(p),e(s),e(a),e(f),e(g),e(T)),C(o,M),C(m,M)}}}function Is(G){let t,J="以下代码示例结合了第一块缓存和上下文并行,以实现最快的推理速度。",p,o,s,a,U='保存到 <code>run_hunyuan_video.py</code> 并使用 <a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a> 启动。',f,m,g,T,u="推理速度降低到 649.23 秒,相比基线快 5.66 倍,使用 8 个 NVIDIA L20 GPU。",W;return o=new 
Q({props:{code:"aW1wb3J0JTIwdGltZSUwQWltcG9ydCUyMHRvcmNoJTBBaW1wb3J0JTIwdG9yY2guZGlzdHJpYnV0ZWQlMjBhcyUyMGRpc3QlMEFmcm9tJTIwZGlmZnVzZXJzJTIwaW1wb3J0JTIwSHVueXVhblZpZGVvUGlwZWxpbmUlMkMlMjBIdW55dWFuVmlkZW9UcmFuc2Zvcm1lcjNETW9kZWwlMEFmcm9tJTIwZGlmZnVzZXJzLnV0aWxzJTIwaW1wb3J0JTIwZXhwb3J0X3RvX3ZpZGVvJTBBJTBBZGlzdC5pbml0X3Byb2Nlc3NfZ3JvdXAoKSUwQSUwQXRvcmNoLmN1ZGEuc2V0X2RldmljZShkaXN0LmdldF9yYW5rKCkpJTBBJTBBbW9kZWxfaWQlMjAlM0QlMjAlMjJ0ZW5jZW50JTJGSHVueXVhblZpZGVvJTIyJTBBdHJhbnNmb3JtZXIlMjAlM0QlMjBIdW55dWFuVmlkZW9UcmFuc2Zvcm1lcjNETW9kZWwuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMG1vZGVsX2lkJTJDJTBBJTIwJTIwJTIwJTIwc3ViZm9sZGVyJTNEJTIydHJhbnNmb3JtZXIlMjIlMkMlMEElMjAlMjAlMjAlMjB0b3JjaF9kdHlwZSUzRHRvcmNoLmJmbG9hdDE2JTJDJTBBJTIwJTIwJTIwJTIwcmV2aXNpb24lM0QlMjJyZWZzJTJGcHIlMkYxOCUyMiUyQyUwQSklMEFwaXBlJTIwJTNEJTIwSHVueXVhblZpZGVvUGlwZWxpbmUuZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMG1vZGVsX2lkJTJDJTBBJTIwJTIwJTIwJTIwdHJhbnNmb3JtZXIlM0R0cmFuc2Zvcm1lciUyQyUwQSUyMCUyMCUyMCUyMHRvcmNoX2R0eXBlJTNEdG9yY2guZmxvYXQxNiUyQyUwQSUyMCUyMCUyMCUyMHJldmlzaW9uJTNEJTIycmVmcyUyRnByJTJGMTglMjIlMkMlMEEpLnRvKCUyMmN1ZGElMjIpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5jb250ZXh0X3BhcmFsbGVsJTIwaW1wb3J0JTIwaW5pdF9jb250ZXh0X3BhcmFsbGVsX21lc2glMEFmcm9tJTIwcGFyYV9hdHRuLmNvbnRleHRfcGFyYWxsZWwuZGlmZnVzZXJzX2FkYXB0ZXJzJTIwaW1wb3J0JTIwcGFyYWxsZWxpemVfcGlwZSUwQWZyb20lMjBwYXJhX2F0dG4ucGFyYWxsZWxfdmFlLmRpZmZ1c2Vyc19hZGFwdGVycyUyMGltcG9ydCUyMHBhcmFsbGVsaXplX3ZhZSUwQSUwQW1lc2glMjAlM0QlMjBpbml0X2NvbnRleHRfcGFyYWxsZWxfbWVzaCglMEElMjAlMjAlMjAlMjBwaXBlLmRldmljZS50eXBlJTJDJTBBKSUwQXBhcmFsbGVsaXplX3BpcGUoJTBBJTIwJTIwJTIwJTIwcGlwZSUyQyUwQSUyMCUyMCUyMCUyMG1lc2glM0RtZXNoJTJDJTBBKSUwQXBhcmFsbGVsaXplX3ZhZShwaXBlLnZhZSUyQyUyMG1lc2glM0RtZXNoLl9mbGF0dGVuKCkpJTBBJTBBZnJvbSUyMHBhcmFfYXR0bi5maXJzdF9ibG9ja19jYWNoZS5kaWZmdXNlcnNfYWRhcHRlcnMlMjBpbXBvcnQlMjBhcHBseV9jYWNoZV9vbl9waXBlJTBBJTBBYXBwbHlfY2FjaGVfb25fcGlwZShwaXBlKSUwQSUwQSUyMyUyMGZyb20lMjB0b3JjaGFvLnF1YW50aXphdGlvbiUyMGltcG9ydCUyMHF1YW50aXplXyUyQyUyMGZsb2F0OF9keW5hbWljX2FjdGl2YXRpb25fZmxvYXQ4X3dl
aWdodCUyQyUyMGZsb2F0OF93ZWlnaHRfb25seSUwQSUyMyUwQSUyMyUyMHRvcmNoLl9pbmR1Y3Rvci5jb25maWcucmVvcmRlcl9mb3JfY29tcHV0ZV9jb21tX292ZXJsYXAlMjAlM0QlMjBUcnVlJTBBJTIzJTBBJTIzJTIwcXVhbnRpemVfKHBpcGUudGV4dF9lbmNvZGVyJTJDJTIwZmxvYXQ4X3dlaWdodF9vbmx5KCkpJTBBJTIzJTIwcXVhbnRpemVfKHBpcGUudHJhbnNmb3JtZXIlMkMlMjBmbG9hdDhfZHluYW1pY19hY3RpdmF0aW9uX2Zsb2F0OF93ZWlnaHQoKSklMEElMjMlMjBwaXBlLnRyYW5zZm9ybWVyJTIwJTNEJTIwdG9yY2guY29tcGlsZSglMEElMjMlMjAlMjAlMjAlMjBwaXBlLnRyYW5zZm9ybWVyJTJDJTIwbW9kZSUzRCUyMm1heC1hdXRvdHVuZS1uby1jdWRhZ3JhcGhzJTIyJTJDJTBBJTIzJTIwKSUwQSUwQSUyMyUyMCVFNSU5MCVBRiVFNyU5NCVBOCVFNSU4NiU4NSVFNSVBRCU5OCVFOCU4QSU4MiVFNyU5QyU4MSUwQXBpcGUudmFlLmVuYWJsZV90aWxpbmcoKSUwQSUyMyUyMHBpcGUuZW5hYmxlX21vZGVsX2NwdV9vZmZsb2FkKGdwdV9pZCUzRGRpc3QuZ2V0X3JhbmsoKSklMEElMjMlMjBwaXBlLmVuYWJsZV9zZXF1ZW50aWFsX2NwdV9vZmZsb2FkKGdwdV9pZCUzRGRpc3QuZ2V0X3JhbmsoKSklMEElMEFmb3IlMjBpJTIwaW4lMjByYW5nZSgyKSUzQSUwQSUyMCUyMCUyMCUyMGJlZ2luJTIwJTNEJTIwdGltZS50aW1lKCklMEElMjAlMjAlMjAlMjBvdXRwdXQlMjAlM0QlMjBwaXBlKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHByb21wdCUzRCUyMkElMjBjYXQlMjB3YWxrcyUyMG9uJTIwdGhlJTIwZ3Jhc3MlMkMlMjByZWFsaXN0aWMlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBoZWlnaHQlM0Q3MjAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aWR0aCUzRDEyODAlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBudW1fZnJhbWVzJTNEMTI5JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbnVtX2luZmVyZW5jZV9zdGVwcyUzRDElMjBpZiUyMGklMjAlM0QlM0QlMjAwJTIwZWxzZSUyMDMwJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwb3V0cHV0X3R5cGUlM0QlMjJwaWwlMjIlMjBpZiUyMGRpc3QuZ2V0X3JhbmsoKSUyMCUzRCUzRCUyMDAlMjBlbHNlJTIwJTIycHQlMjIlMkMlMEElMjAlMjAlMjAlMjApLmZyYW1lcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbmQlMjAlM0QlMjB0aW1lLnRpbWUoKSUwQSUyMCUyMCUyMCUyMGlmJTIwZGlzdC5nZXRfcmFuaygpJTIwJTNEJTNEJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwcmludChmJTIyJUU5JUEyJTg0JUU3JTgzJUFEJUU2JTk3JUI2JUU5JTk3JUI0JTNBJTIwJTdCZW5kJTIwLSUyMGJlZ2luJTNBLjJmJTdEcyUyMiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIw
JTIwJTIwJTIwJTIwJTIwJTIwcHJpbnQoZiUyMiVFNiU5NyVCNiVFOSU5NyVCNCUzQSUyMCU3QmVuZCUyMC0lMjBiZWdpbiUzQS4yZiU3RHMlMjIpJTBBJTBBaWYlMjBkaXN0LmdldF9yYW5rKCklMjAlM0QlM0QlMjAwJTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoJTIyJUU0JUJGJTlEJUU1JUFEJTk4JUU4JUE3JTg2JUU5JUEyJTkxJUU1JTg4JUIwJTIwaHVueXVhbl92aWRlby5tcDQlMjIpJTBBJTIwJTIwJTIwJTIwZXhwb3J0X3RvX3ZpZGVvKG91dHB1dCUyQyUyMCUyMmh1bnl1YW5fdmlkZW8ubXA0JTIyJTJDJTIwZnBzJTNEMTUpJTBBJTBBZGlzdC5kZXN0cm95X3Byb2Nlc3NfZ3JvdXAoKQ==",highlighted:`<span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">import</span> torch.distributed <span class="hljs-keyword">as</span> dist
<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
<span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video
dist.init_process_group()
torch.cuda.set_device(dist.get_rank())
model_id = <span class="hljs-string">&quot;tencent/HunyuanVideo&quot;</span>
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder=<span class="hljs-string">&quot;transformer&quot;</span>,
    torch_dtype=torch.bfloat16,
    revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
)
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    torch_dtype=torch.float16,
    revision=<span class="hljs-string">&quot;refs/pr/18&quot;</span>,
).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">from</span> para_attn.context_parallel <span class="hljs-keyword">import</span> init_context_parallel_mesh
<span class="hljs-keyword">from</span> para_attn.context_parallel.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_pipe
<span class="hljs-keyword">from</span> para_attn.parallel_vae.diffusers_adapters <span class="hljs-keyword">import</span> parallelize_vae
mesh = init_context_parallel_mesh(
    pipe.device.<span class="hljs-built_in">type</span>,
)
parallelize_pipe(
    pipe,
    mesh=mesh,
)
parallelize_vae(pipe.vae, mesh=mesh._flatten())
<span class="hljs-keyword">from</span> para_attn.first_block_cache.diffusers_adapters <span class="hljs-keyword">import</span> apply_cache_on_pipe
apply_cache_on_pipe(pipe)
<span class="hljs-comment"># from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight, float8_weight_only</span>
<span class="hljs-comment">#</span>
<span class="hljs-comment"># torch._inductor.config.reorder_for_compute_comm_overlap = True</span>
<span class="hljs-comment">#</span>
<span class="hljs-comment"># quantize_(pipe.text_encoder, float8_weight_only())</span>
<span class="hljs-comment"># quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())</span>
<span class="hljs-comment"># pipe.transformer = torch.compile(</span>
<span class="hljs-comment"># pipe.transformer, mode=&quot;max-autotune-no-cudagraphs&quot;,</span>
<span class="hljs-comment"># )</span>
<span class="hljs-comment"># 启用内存节省</span>
pipe.vae.enable_tiling()
<span class="hljs-comment"># pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-comment"># pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">2</span>):
    begin = time.time()
    output = pipe(
        prompt=<span class="hljs-string">&quot;A cat walks on the grass, realistic&quot;</span>,
        height=<span class="hljs-number">720</span>,
        width=<span class="hljs-number">1280</span>,
        num_frames=<span class="hljs-number">129</span>,
        num_inference_steps=<span class="hljs-number">1</span> <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-number">30</span>,
        output_type=<span class="hljs-string">&quot;pil&quot;</span> <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;pt&quot;</span>,
    ).frames[<span class="hljs-number">0</span>]
    end = time.time()
    <span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
        <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
            <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;预热时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
        <span class="hljs-keyword">else</span>:
            <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;时间: <span class="hljs-subst">{end - begin:<span class="hljs-number">.2</span>f}</span>s&quot;</span>)
<span class="hljs-keyword">if</span> dist.get_rank() == <span class="hljs-number">0</span>:
    <span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;保存视频到 hunyuan_video.mp4&quot;</span>)
    export_to_video(output, <span class="hljs-string">&quot;hunyuan_video.mp4&quot;</span>, fps=<span class="hljs-number">15</span>)
dist.destroy_process_group()`,wrap:!1}}),m=new Q({props:{code:"JTIzJTIwJUU0JUJEJUJGJUU3JTk0JUE4JTIwLS1ucHJvY19wZXJfbm9kZSUyMCVFNiU4QyU4NyVFNSVBRSU5QSUyMEdQVSUyMCVFNiU5NSVCMCVFOSU4NyU4RiUwQXRvcmNocnVuJTIwLS1ucHJvY19wZXJfbm9kZSUzRDglMjBydW5faHVueXVhbl92aWRlby5weQ==",highlighted:`<span class="hljs-comment"># 使用 --nproc_per_node 指定 GPU 数量</span>
torchrun --nproc_per_node=8 run_hunyuan_video.py`,wrap:!1}}),{c(){t=y("p"),t.textContent=J,p=c(),b(o.$$.fragment),s=c(),a=y("p"),a.innerHTML=U,f=c(),b(m.$$.fragment),g=c(),T=y("p"),T.textContent=u},l(M){t=d(M,"P",{"data-svelte-h":!0}),h(t)!=="svelte-58tdmv"&&(t.textContent=J),p=r(M),Z(o.$$.fragment,M),s=r(M),a=d(M,"P",{"data-svelte-h":!0}),h(a)!=="svelte-pk6cu0"&&(a.innerHTML=U),f=r(M),Z(m.$$.fragment,M),g=r(M),T=d(M,"P",{"data-svelte-h":!0}),h(T)!=="svelte-nfook2"&&(T.textContent=u)},m(M,w){n(M,t,w),n(M,p,w),B(o,M,w),n(M,s,w),n(M,a,w),n(M,f,w),B(m,M,w),n(M,g,w),n(M,T,w),W=!0},p:x,i(M){W||(_(o.$$.fragment,M),_(m.$$.fragment,M),W=!0)},o(M){I(o.$$.fragment,M),I(m.$$.fragment,M),W=!1},d(M){M&&(e(t),e(p),e(s),e(a),e(f),e(g),e(T)),C(o,M),C(m,M)}}}function Cs(G){let t,J,p,o;return t=new N({props:{id:"context-parallelism",option:"FLUX-1.dev",$$slots:{default:[_s]},$$scope:{ctx:G}}}),p=new N({props:{id:"context-parallelism",option:"HunyuanVideo",$$slots:{default:[Is]},$$scope:{ctx:G}}}),{c(){b(t.$$.fragment),J=c(),b(p.$$.fragment)},l(s){Z(t.$$.fragment,s),J=r(s),Z(p.$$.fragment,s)},m(s,a){B(t,s,a),n(s,J,a),B(p,s,a),o=!0},p(s,a){const U={};a&2&&(U.$$scope={dirty:a,ctx:s}),t.$set(U);const f={};a&2&&(f.$$scope={dirty:a,ctx:s}),p.$set(f)},i(s){o||(_(t.$$.fragment,s),_(p.$$.fragment,s),o=!0)},o(s){I(t.$$.fragment,s),I(p.$$.fragment,s),o=!1},d(s){s&&e(J),C(t,s),C(p,s)}}}function Gs(G){let t,J="<thead><tr><th>GPU 类型</th> <th>GPU 数量</th> <th>优化</th> <th>墙钟时间 (s)</th> <th>加速比</th></tr></thead> <tbody><tr><td>NVIDIA L20</td> <td>1</td> <td>基线</td> <td>26.36</td> <td>1.00x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FBCache (rdt=0.08)</td> <td>17.01</td> <td>1.55x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FP8 DQ</td> <td>13.40</td> <td>1.96x</td></tr> <tr><td>NVIDIA L20</td> <td>1</td> <td>FBCache (rdt=0.12) + FP8 DQ</td> <td>7.56</td> <td>3.48x</td></tr> <tr><td>NVIDIA L20</td> <td>2</td> <td>FBCache (rdt=0.12) + FP8 DQ + CP</td> <td>4.92</td> <td>5.35x</td></tr> 
<tr><td>NVIDIA L20</td> <td>4</td> <td>FBCache (rdt=0.12) + FP8 DQ + CP</td> <td>3.90</td> <td>6.75x</td></tr></tbody>";return{c(){t=y("table"),t.innerHTML=J},l(p){t=d(p,"TABLE",{"data-svelte-h":!0}),h(t)!=="svelte-qm3hdc"&&(t.innerHTML=J)},m(p,o){n(p,t,o)},p:x,d(p){p&&e(t)}}}function gs(G){let t,J="<thead><tr><th>GPU 类型</th> <th>GPU 数量</th> <th>优化</th> <th>墙钟时间 (s)</th> <th>加速比</th></tr></thead> <tbody><tr><td>NVIDIA L20</td> <td>1</td> <td>基线</td> <td>3675.71</td> <td>1.00x</td></tr></tbody>",p,o,s=`| NVIDIA
L20 | 1 | FBCache | 2271.06 | 1.62x |
| NVIDIA L20 | 2 | FBCache + CP | 1132.90 | 3.24x |
| NVIDIA L20 | 4 | FBCache + CP | 718.15 | 5.12x |
| NVIDIA L20 | 8 | FBCache + CP | 649.23 | 5.66x |`;return{c(){t=y("table"),t.innerHTML=J,p=c(),o=y("p"),o.textContent=s},l(a){t=d(a,"TABLE",{"data-svelte-h":!0}),h(t)!=="svelte-g64hnd"&&(t.innerHTML=J),p=r(a),o=d(a,"P",{"data-svelte-h":!0}),h(o)!=="svelte-138nu2o"&&(o.textContent=s)},m(a,U){n(a,t,U),n(a,p,U),n(a,o,U)},p:x,d(a){a&&(e(t),e(p),e(o))}}}function Ws(G){let t,J,p,o;return t=new N({props:{id:"conclusion",option:"FLUX-1.dev",$$slots:{default:[Gs]},$$scope:{ctx:G}}}),p=new N({props:{id:"conclusion",option:"HunyuanVideo",$$slots:{default:[gs]},$$scope:{ctx:G}}}),{c(){b(t.$$.fragment),J=c(),b(p.$$.fragment)},l(s){Z(t.$$.fragment,s),J=r(s),Z(p.$$.fragment,s)},m(s,a){B(t,s,a),n(s,J,a),B(p,s,a),o=!0},p(s,a){const U={};a&2&&(U.$$scope={dirty:a,ctx:s}),t.$set(U);const f={};a&2&&(f.$$scope={dirty:a,ctx:s}),p.$set(f)},i(s){o||(_(t.$$.fragment,s),_(p.$$.fragment,s),o=!0)},o(s){I(t.$$.fragment,s),I(p.$$.fragment,s),o=!1},d(s){s&&e(J),C(t,s),C(p,s)}}}function Vs(G){let t,J,p,o,s,a,U,f,m,g='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-performance.png"/>',T,u,W='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-performance.png"/>',M,w,ml='大型图像和视频生成模型,如 <a href="https://huggingface.co/black-forest-labs/FLUX.1-dev" rel="nofollow">FLUX.1-dev</a> 和 <a href="https://huggingface.co/tencent/HunyuanVideo" rel="nofollow">HunyuanVideo</a>,由于其规模,可能对实时应用和部署构成推理挑战。',X,k,Jl='<a href="https://github.com/chengzeyi/ParaAttention" rel="nofollow">ParaAttention</a> 是一个实现了<strong>上下文并行</strong>和<strong>第一块缓存</strong>的库,可以与其他技术(如 torch.compile、fp8 动态量化)结合使用,以加速推理。',v,j,V=`本指南将展示如何在 NVIDIA L20 GPU 上对 FLUX.1-dev 和 HunyuanVideo 应用 ParaAttention。
在我们的基线基准测试中,除了 HunyuanVideo 为避免内存不足错误外,未应用任何优化。`,dl,Y,xl="我们的基线基准测试显示,FLUX.1-dev 能够在 28 步中生成 1024x1024 分辨率图像,耗时 26.36 秒;HunyuanVideo 能够在 30 步中生成 129 帧 720p 分辨率视频,耗时 3675.71 秒。",Ul,$,Yl="<p>对于更快的上下文并行推理,请尝试使用支持 NVLink 的 NVIDIA A100 或 H100 GPU(如果可用),尤其是在 GPU 数量较多时。</p>",jl,z,hl,L,zl="缓存模型中 transformer 块的输出并在后续推理步骤中重用它们,可以降低计算成本并加速推理。",fl,q,Ll="然而,很难决定何时重用缓存以确保生成图像或视频的质量。ParaAttention 直接使用<strong>第一个 transformer 块输出的残差差异</strong>来近似模型输出之间的差异。当差异足够小时,重用先前推理步骤的残差差异。换句话说,跳过去噪步骤。",wl,D,ql="这在 FLUX.1-dev 和 HunyuanVideo 推理上实现了 2 倍加速,且质量非常好。",Tl,K,Dl='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/ada-cache.png" alt="Cache in Diffusion Transformer"/> <figcaption>AdaCache 的工作原理,第一块缓存是其变体</figcaption>',bl,A,Zl,P,Bl,O,Kl='fp8 动态量化进一步加速推理并减少内存使用。为了使用 8 位 <a href="https://www.nvidia.com/en-us/data-center/tensor-cores/" rel="nofollow">NVIDIA Tensor Cores</a>,必须对激活和权重进行量化。',_l,ll,Pl="使用 <code>float8_weight_only</code> 和 <code>float8_dynamic_activation_float8_weight</code> 来量化文本编码器和变换器模型。",Il,sl,Ol="默认量化方法是逐张量量化,但如果您的 GPU 支持逐行量化,您也可以尝试它以获得更好的准确性。",Cl,tl,ls='使用以下命令安装 <a href="https://github.com/pytorch/ao/tree/main" rel="nofollow">torchao</a>。',Gl,el,gl,al,ss='<a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> 使用 <code>mode=&quot;max-autotune-no-cudagraphs&quot;</code> 或 <code>mode=&quot;max-autotune&quot;</code> 选择最佳内核以获得性能。如果是第一次调用模型,编译可能会花费很长时间,但一旦模型编译完成,这是值得的。',Wl,nl,ts="此示例仅量化变换器模型,但您也可以量化文本编码器以进一步减少内存使用。",Vl,E,es="<p>动态量化可能会显著改变模型输出的分布,因此您需要将 <code>residual_diff_threshold</code> 设置为更大的值以使其生效。</p>",kl,F,Xl,pl,vl,il,as="上下文并行性并行化推理并随多个 GPU 扩展。ParaAttention 组合设计允许您将上下文并行性与第一块缓存和动态量化结合使用。",Ql,R,ns='<p>请参考 <a href="https://github.com/chengzeyi/ParaAttention/tree/main" rel="nofollow">ParaAttention</a> 仓库获取详细说明和如何使用多个 GPU 扩展推理的示例。</p>',$l,ol,ps='如果推理过程需要持久化和可服务,建议使用 <a href="https://pytorch.org/docs/stable/multiprocessing.html" 
rel="nofollow">torch.multiprocessing</a> 编写您自己的推理处理器。这可以消除启动进程以及加载和重新编译模型的开销。',Al,H,El,Ml,Fl,S,Rl,cl,Hl,ul,Sl;return s=new js({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),U=new yl({props:{title:"ParaAttention",local:"paraattention",headingTag:"h1"}}),z=new yl({props:{title:"第一块缓存",local:"第一块缓存",headingTag:"h2"}}),A=new Nl({props:{id:"first-block-cache",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[Ts]},$$scope:{ctx:G}}}),P=new yl({props:{title:"fp8 量化",local:"fp8-量化",headingTag:"h2"}}),el=new Q({props:{code:"cGlwMyUyMGluc3RhbGwlMjAtVSUyMHRvcmNoJTIwdG9yY2hhbw==",highlighted:"pip3 install -U torch torchao",wrap:!1}}),F=new Nl({props:{id:"fp8-quantization",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[Bs]},$$scope:{ctx:G}}}),pl=new yl({props:{title:"上下文并行性",local:"上下文并行性",headingTag:"h2"}}),H=new Nl({props:{id:"context-parallelism",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[Cs]},$$scope:{ctx:G}}}),Ml=new yl({props:{title:"基准测试",local:"基准测试",headingTag:"h2"}}),S=new Nl({props:{id:"conclusion",options:["FLUX-1.dev","HunyuanVideo"],$$slots:{default:[Ws]},$$scope:{ctx:G}}}),cl=new 
hs({props:{source:"https://github.com/huggingface/diffusers/blob/main/docs/source/zh/optimization/para_attn.md"}}),{c(){t=y("meta"),J=c(),p=y("p"),o=c(),b(s.$$.fragment),a=c(),b(U.$$.fragment),f=c(),m=y("div"),m.innerHTML=g,T=c(),u=y("div"),u.innerHTML=W,M=c(),w=y("p"),w.innerHTML=ml,X=c(),k=y("p"),k.innerHTML=Jl,v=c(),j=y("p"),j.textContent=V,dl=c(),Y=y("p"),Y.textContent=xl,Ul=c(),$=y("blockquote"),$.innerHTML=Yl,jl=c(),b(z.$$.fragment),hl=c(),L=y("p"),L.textContent=zl,fl=c(),q=y("p"),q.innerHTML=Ll,wl=c(),D=y("p"),D.textContent=ql,Tl=c(),K=y("figure"),K.innerHTML=Dl,bl=c(),b(A.$$.fragment),Zl=c(),b(P.$$.fragment),Bl=c(),O=y("p"),O.innerHTML=Kl,_l=c(),ll=y("p"),ll.innerHTML=Pl,Il=c(),sl=y("p"),sl.textContent=Ol,Cl=c(),tl=y("p"),tl.innerHTML=ls,Gl=c(),b(el.$$.fragment),gl=c(),al=y("p"),al.innerHTML=ss,Wl=c(),nl=y("p"),nl.textContent=ts,Vl=c(),E=y("blockquote"),E.innerHTML=es,kl=c(),b(F.$$.fragment),Xl=c(),b(pl.$$.fragment),vl=c(),il=y("p"),il.textContent=as,Ql=c(),R=y("blockquote"),R.innerHTML=ns,$l=c(),ol=y("p"),ol.innerHTML=ps,Al=c(),b(H.$$.fragment),El=c(),b(Ml.$$.fragment),Fl=c(),b(S.$$.fragment),Rl=c(),b(cl.$$.fragment),Hl=c(),ul=y("p"),this.h()},l(l){const 
i=ds("svelte-u9bgzb",document.head);t=d(i,"META",{name:!0,content:!0}),i.forEach(e),J=r(l),p=d(l,"P",{}),rs(p).forEach(e),o=r(l),Z(s.$$.fragment,l),a=r(l),Z(U.$$.fragment,l),f=r(l),m=d(l,"DIV",{class:!0,"data-svelte-h":!0}),h(m)!=="svelte-1p4slnk"&&(m.innerHTML=g),T=r(l),u=d(l,"DIV",{class:!0,"data-svelte-h":!0}),h(u)!=="svelte-1kqq4mt"&&(u.innerHTML=W),M=r(l),w=d(l,"P",{"data-svelte-h":!0}),h(w)!=="svelte-1aznnck"&&(w.innerHTML=ml),X=r(l),k=d(l,"P",{"data-svelte-h":!0}),h(k)!=="svelte-nxzfeq"&&(k.innerHTML=Jl),v=r(l),j=d(l,"P",{"data-svelte-h":!0}),h(j)!=="svelte-ckrkoc"&&(j.textContent=V),dl=r(l),Y=d(l,"P",{"data-svelte-h":!0}),h(Y)!=="svelte-wz0gnx"&&(Y.textContent=xl),Ul=r(l),$=d(l,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),h($)!=="svelte-m6tkds"&&($.innerHTML=Yl),jl=r(l),Z(z.$$.fragment,l),hl=r(l),L=d(l,"P",{"data-svelte-h":!0}),h(L)!=="svelte-17ussag"&&(L.textContent=zl),fl=r(l),q=d(l,"P",{"data-svelte-h":!0}),h(q)!=="svelte-1dduett"&&(q.innerHTML=Ll),wl=r(l),D=d(l,"P",{"data-svelte-h":!0}),h(D)!=="svelte-1y9k7c"&&(D.textContent=ql),Tl=r(l),K=d(l,"FIGURE",{"data-svelte-h":!0}),h(K)!=="svelte-1cjjxth"&&(K.innerHTML=Dl),bl=r(l),Z(A.$$.fragment,l),Zl=r(l),Z(P.$$.fragment,l),Bl=r(l),O=d(l,"P",{"data-svelte-h":!0}),h(O)!=="svelte-iz99fc"&&(O.innerHTML=Kl),_l=r(l),ll=d(l,"P",{"data-svelte-h":!0}),h(ll)!=="svelte-3fzx9w"&&(ll.innerHTML=Pl),Il=r(l),sl=d(l,"P",{"data-svelte-h":!0}),h(sl)!=="svelte-f45i0k"&&(sl.textContent=Ol),Cl=r(l),tl=d(l,"P",{"data-svelte-h":!0}),h(tl)!=="svelte-1s76vee"&&(tl.innerHTML=ls),Gl=r(l),Z(el.$$.fragment,l),gl=r(l),al=d(l,"P",{"data-svelte-h":!0}),h(al)!=="svelte-ubr0pd"&&(al.innerHTML=ss),Wl=r(l),nl=d(l,"P",{"data-svelte-h":!0}),h(nl)!=="svelte-1w4d2au"&&(nl.textContent=ts),Vl=r(l),E=d(l,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),h(E)!=="svelte-5igzue"&&(E.innerHTML=es),kl=r(l),Z(F.$$.fragment,l),Xl=r(l),Z(pl.$$.fragment,l),vl=r(l),il=d(l,"P",{"data-svelte-h":!0}),h(il)!=="svelte-125hian"&&(il.textContent=as),Ql=r(l),R=d(l,"BLOCKQUOTE",
{class:!0,"data-svelte-h":!0}),h(R)!=="svelte-zcqlrq"&&(R.innerHTML=ns),$l=r(l),ol=d(l,"P",{"data-svelte-h":!0}),h(ol)!=="svelte-gwlw48"&&(ol.innerHTML=ps),Al=r(l),Z(H.$$.fragment,l),El=r(l),Z(Ml.$$.fragment,l),Fl=r(l),Z(S.$$.fragment,l),Rl=r(l),Z(cl.$$.fragment,l),Hl=r(l),ul=d(l,"P",{}),rs(ul).forEach(e),this.h()},h(){rl(t,"name","hf:doc:metadata"),rl(t,"content",ks),rl(m,"class","flex justify-center"),rl(u,"class","flex justify-center"),rl($,"class","tip"),rl(E,"class","tip"),rl(R,"class","tip")},m(l,i){Us(document.head,t),n(l,J,i),n(l,p,i),n(l,o,i),B(s,l,i),n(l,a,i),B(U,l,i),n(l,f,i),n(l,m,i),n(l,T,i),n(l,u,i),n(l,M,i),n(l,w,i),n(l,X,i),n(l,k,i),n(l,v,i),n(l,j,i),n(l,dl,i),n(l,Y,i),n(l,Ul,i),n(l,$,i),n(l,jl,i),B(z,l,i),n(l,hl,i),n(l,L,i),n(l,fl,i),n(l,q,i),n(l,wl,i),n(l,D,i),n(l,Tl,i),n(l,K,i),n(l,bl,i),B(A,l,i),n(l,Zl,i),B(P,l,i),n(l,Bl,i),n(l,O,i),n(l,_l,i),n(l,ll,i),n(l,Il,i),n(l,sl,i),n(l,Cl,i),n(l,tl,i),n(l,Gl,i),B(el,l,i),n(l,gl,i),n(l,al,i),n(l,Wl,i),n(l,nl,i),n(l,Vl,i),n(l,E,i),n(l,kl,i),B(F,l,i),n(l,Xl,i),B(pl,l,i),n(l,vl,i),n(l,il,i),n(l,Ql,i),n(l,R,i),n(l,$l,i),n(l,ol,i),n(l,Al,i),B(H,l,i),n(l,El,i),B(Ml,l,i),n(l,Fl,i),B(S,l,i),n(l,Rl,i),B(cl,l,i),n(l,Hl,i),n(l,ul,i),Sl=!0},p(l,[i]){const is={};i&2&&(is.$$scope={dirty:i,ctx:l}),A.$set(is);const os={};i&2&&(os.$$scope={dirty:i,ctx:l}),F.$set(os);const Ms={};i&2&&(Ms.$$scope={dirty:i,ctx:l}),H.$set(Ms);const 
cs={};i&2&&(cs.$$scope={dirty:i,ctx:l}),S.$set(cs)},i(l){Sl||(_(s.$$.fragment,l),_(U.$$.fragment,l),_(z.$$.fragment,l),_(A.$$.fragment,l),_(P.$$.fragment,l),_(el.$$.fragment,l),_(F.$$.fragment,l),_(pl.$$.fragment,l),_(H.$$.fragment,l),_(Ml.$$.fragment,l),_(S.$$.fragment,l),_(cl.$$.fragment,l),Sl=!0)},o(l){I(s.$$.fragment,l),I(U.$$.fragment,l),I(z.$$.fragment,l),I(A.$$.fragment,l),I(P.$$.fragment,l),I(el.$$.fragment,l),I(F.$$.fragment,l),I(pl.$$.fragment,l),I(H.$$.fragment,l),I(Ml.$$.fragment,l),I(S.$$.fragment,l),I(cl.$$.fragment,l),Sl=!1},d(l){l&&(e(J),e(p),e(o),e(a),e(f),e(m),e(T),e(u),e(M),e(w),e(X),e(k),e(v),e(j),e(dl),e(Y),e(Ul),e($),e(jl),e(hl),e(L),e(fl),e(q),e(wl),e(D),e(Tl),e(K),e(bl),e(Zl),e(Bl),e(O),e(_l),e(ll),e(Il),e(sl),e(Cl),e(tl),e(Gl),e(gl),e(al),e(Wl),e(nl),e(Vl),e(E),e(kl),e(Xl),e(vl),e(il),e(Ql),e(R),e($l),e(ol),e(Al),e(El),e(Fl),e(Rl),e(Hl),e(ul)),e(t),C(s,l),C(U,l),C(z,l),C(A,l),C(P,l),C(el,l),C(F,l),C(pl,l),C(H,l),C(Ml,l),C(S,l),C(cl,l)}}}const ks='{"title":"ParaAttention","local":"paraattention","sections":[{"title":"第一块缓存","local":"第一块缓存","sections":[],"depth":2},{"title":"fp8 量化","local":"fp8-量化","sections":[],"depth":2},{"title":"上下文并行性","local":"上下文并行性","sections":[],"depth":2},{"title":"基准测试","local":"基准测试","sections":[],"depth":2}],"depth":1}';function Xs(G){return Js(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Fs extends us{constructor(t){super(),ys(this,t,Xs,Vs,ms,{})}}export{Fs as component};

Xet Storage Details

Size:
59.5 kB
·
Xet hash:
7ae5810abdced5ee727d76cea6d7a4bd4ebd1e67f4b67939ab300074a7ae41aa

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.