Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / diffusers /pr_11739 /zh /training /distributed_inference.html

rtrm

4 months ago

download

raw

39.9 kB

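	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;分布式推理&quot;,&quot;local&quot;:&quot;分布式推理&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;🤗 Accelerate&quot;,&quot;local&quot;:&quot;-accelerate&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;PyTorch Distributed&quot;,&quot;local&quot;:&quot;pytorch-distributed&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;模型分片&quot;,&quot;local&quot;:&quot;模型分片&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">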
	<link href="/docs/diffusers/pr_11739/zh/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/entry/start.95a8faef.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/scheduler.e4ff9b64.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/singletons.0a6f1d19.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/index.f9be34a7.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/paths.37f6e25a.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/entry/app.a988cdaf.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/preload-helper.3e2c3f46.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/index.09f1bca0.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/nodes/0.0ec3fec6.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/nodes/43.67ecf388.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.f5199cd9.js">
	<link rel="modulepreload" href="/docs/diffusers/pr_11739/zh/_app/immutable/chunks/CodeBlock.1680a1fd.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"分布式推理","local":"分布式推理","sections":[{"title":"🤗 Accelerate","local":"-accelerate","sections":[],"depth":2},{"title":"PyTorch Distributed","local":"pytorch-distributed","sections":[],"depth":2},{"title":"模型分片","local":"模型分片","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 
dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="分布式推理" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#分布式推理"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>分布式推理</span></h1> <p data-svelte-h="svelte-164t5yx">在分布式设置中，您可以使用 🤗 <a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate</a> 或 <a href="https://pytorch.org/tutorials/beginner/dist_overview.html" rel="nofollow">PyTorch Distributed</a> 在多个 GPU 上运行推理，这对于并行生成多个提示非常有用。</p> <p data-svelte-h="svelte-qajcxl">本指南将向您展示如何使用 🤗 Accelerate 和 PyTorch Distributed 进行分布式推理。</p> <h2 class="relative group"><a id="-accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 
with-hover:right-full" href="#-accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🤗 Accelerate</span></h2> <p data-svelte-h="svelte-10vyc10">🤗 <a href="https://huggingface.co/docs/accelerate/index" rel="nofollow">Accelerate</a> 是一个旨在简化在分布式设置中训练或运行推理的库。它简化了设置分布式环境的过程，让您可以专注于您的 PyTorch 代码。</p> <p data-svelte-h="svelte-k495sb">首先，创建一个 Python 文件并初始化一个 <code>accelerate.PartialState</code> 来创建分布式环境；您的设置会自动检测，因此您无需明确定义 <code>rank</code> 或 <code>world_size</code>。将 <code>DiffusionPipeline</code> 移动到 <code>distributed_state.device</code> 以为每个进程分配一个 GPU。</p> <p data-svelte-h="svelte-1224dwq">现在使用 <code>split_between_processes</code> 实用程序作为上下文管理器，自动在进程数之间分发提示。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> PartialState
	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline

	pipeline = DiffusionPipeline.from_pretrained(
	    <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
	)
	distributed_state = PartialState()
	pipeline.to(distributed_state.device)

	<span class="hljs-keyword">with</span> distributed_state.split_between_processes([<span class="hljs-string">"a dog"</span>, <span class="hljs-string">"a cat"</span>]) <span class="hljs-keyword">as</span> prompt:
	    result = pipeline(prompt).images[<span class="hljs-number">0</span>]
	result.save(<span class="hljs-string">f"result_<span class="hljs-subst">{distributed_state.process_index}</span>.png"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eq5z11">使用 <code>--num_processes</code> 参数指定要使用的 GPU 数量，并调用 <code>accelerate launch</code> 来运行脚本：</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->accelerate launch run_distributed.py --num_processes=2<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-87d30k"><p>参考这个最小示例 <a href="https://gist.github.com/sayakpaul/cfaebd221820d7b43fae638b4dfa01ba" rel="nofollow">脚本</a> 以在多个 GPU 上运行推理。要了解更多信息，请查看 <a href="https://huggingface.co/docs/accelerate/en/usage_guides/distributed_inference#distributed-inference-with-accelerate" rel="nofollow">使用 🤗 Accelerate 进行分布式推理</a> 指南。</p></blockquote> <h2 class="relative group"><a 
id="pytorch-distributed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pytorch-distributed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>PyTorch Distributed</span></h2> <p data-svelte-h="svelte-7gn49b">PyTorch 支持 <a href="https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html" rel="nofollow"><code>DistributedDataParallel</code></a>，它启用了数据
	并行性。</p> <p data-svelte-h="svelte-1vhpcpe">首先，创建一个 Python 文件并导入 <code>torch.distributed</code> 和 <code>torch.multiprocessing</code> 来设置分布式进程组，并为每个 GPU 上的推理生成进程。您还应该初始化一个 <code>DiffusionPipeline</code>：</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">import</span> torch.distributed <span class="hljs-keyword">as</span> dist
	<span class="hljs-keyword">import</span> torch.multiprocessing <span class="hljs-keyword">as</span> mp

	<span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> DiffusionPipeline

	sd = DiffusionPipeline.from_pretrained(
	    <span class="hljs-string">"stable-diffusion-v1-5/stable-diffusion-v1-5"</span>, torch_dtype=torch.float16, use_safetensors=<span class="hljs-literal">True</span>
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-109hrfe">您需要创建一个函数来运行推理；<a href="https://pytorch.org/docs/stable/distributed.html?highlight=init_process_group#torch.distributed.init_process_group" rel="nofollow"><code>init_process_group</code></a> 处理创建一个分布式环境，指定要使用的后端类型、当前进程的 <code>rank</code> 以及参与进程的数量 <code>world_size</code>。如果您在 2 个 GPU 上并行运行推理，那么 <code>world_size</code> 就是 2。</p> <p data-svelte-h="svelte-1qcsr3i">将 <code>DiffusionPipeline</code> 移动到 <code>rank</code>，并使用 <code>get_rank</code> 为每个进程分配一个 GPU，其中每个进程处理不同的提示：</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">run_inference</span>(<span class="hljs-params">rank, world_size</span>):
	dist.init_process_group(<span class="hljs-string">"nccl"</span>, rank=rank, world_size=world_size)

	sd.to(rank)

	<span class="hljs-keyword">if</span> torch.distributed.get_rank() == <span class="hljs-number">0</span>:
	prompt = <span class="hljs-string">"a dog"</span>
	<span class="hljs-keyword">elif</span> torch.distributed.get_rank() == <span class="hljs-number">1</span>:
	prompt = <span class="hljs-string">"a cat"</span>

	image = sd(prompt).images[<span class="hljs-number">0</span>]
	image.save(<span class="hljs-string">f"./<span class="hljs-subst">{<span class="hljs-string">'_'</span>.join(prompt)}</span>.png"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yf0k86">要运行分布式推理，调用 <a href="https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn" rel="nofollow"><code>mp.spawn</code></a> 在 <code>world_size</code> 定义的 GPU 数量上运行 <code>run_inference</code> 函数：</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">main</span>():
	    world_size = <span class="hljs-number">2</span>
	    mp.spawn(run_inference, args=(world_size,), nprocs=world_size, join=<span class="hljs-literal">True</span>)


	<span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">"__main__"</span>:
	main()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ebzc0p">完成推理脚本后，使用 <code>--nproc_per_node</code> 参数指定要使用的 GPU 数量，并调用 <code>torchrun</code> 来运行脚本：</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun run_distributed.py --nproc_per_node=2<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-zkfe1n"><p>您可以在 <code>DiffusionPipeline</code> 中使用 <code>device_map</code> 将其模型级组件分布在多个设备上。请参考 <a href="../tutorials/inference_with_big_models#device-placement">设备放置</a> 指南了解更多信息。</p></blockquote> <h2 class="relative group"><a id="模型分片" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#模型分片"><span><svg class="" xmlns="http://www.w3.org/2000/svg" 
xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>模型分片</span></h2> <p data-svelte-h="svelte-yu6o1n">现代扩散系统，如 <a href="../api/pipelines/flux">Flux</a>，非常大且包含多个模型。例如，<a href="https://hf.co/black-forest-labs/FLUX.1-dev" rel="nofollow">Flux.1-Dev</a> 由两个文本编码器 - <a href="https://hf.co/google/t5-v1_1-xxl" rel="nofollow">T5-XXL</a> 和 <a href="https://hf.co/openai/clip-vit-large-patch14" rel="nofollow">CLIP-L</a> - 一个 <a href="../api/models/flux_transformer">扩散变换器</a>，以及一个 <a href="../api/models/autoencoderkl">VAE</a> 组成。对于如此大的模型，在消费级 GPU 上运行推理可能具有挑战性。</p> <p data-svelte-h="svelte-1wtcutz">模型分片是一种技术，当模型无法容纳在单个 GPU 上时，将模型分布在多个 GPU 上。下面的示例假设有两个 16GB GPU 可用于推理。</p> <p data-svelte-h="svelte-vjsvbo">开始使用文本编码器计算文本嵌入。通过设置 <code>device_map="balanced"</code> 将文本编码器保持在两个GPU上。<code>balanced</code> 策略将模型均匀分布在所有可用GPU上。使用 <code>max_memory</code> 参数为每个GPU上的每个文本编码器分配最大内存量。</p> <blockquote class="tip" data-svelte-h="svelte-zyi0qz"><p><strong>仅</strong> 在此步骤加载文本编码器！扩散变换器和VAE在后续步骤中加载以节省内存。</p></blockquote> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> FluxPipeline
	<span class="hljs-keyword">import</span> torch

	prompt = <span class="hljs-string">"a photo of a dog with cat-like look"</span>

	pipeline = FluxPipeline.from_pretrained(
	    <span class="hljs-string">"black-forest-labs/FLUX.1-dev"</span>,
	    transformer=<span class="hljs-literal">None</span>,
	    vae=<span class="hljs-literal">None</span>,
	    device_map=<span class="hljs-string">"balanced"</span>,
	    max_memory={<span class="hljs-number">0</span>: <span class="hljs-string">"16GB"</span>, <span class="hljs-number">1</span>: <span class="hljs-string">"16GB"</span>},
	    torch_dtype=torch.bfloat16
	)
	<span class="hljs-keyword">with</span> torch.no_grad():
	    <span class="hljs-built_in">print</span>(<span class="hljs-string">"Encoding prompts."</span>)
	    prompt_embeds, pooled_prompt_embeds, text_ids = pipeline.encode_prompt(
	        prompt=prompt, prompt_2=<span class="hljs-literal">None</span>, max_sequence_length=<span class="hljs-number">512</span>
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-101ikwk">一旦文本嵌入计算完成，从GPU中移除它们以为扩散变换器腾出空间。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> gc

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">flush</span>():
	    gc.collect()
	    torch.cuda.empty_cache()
	    torch.cuda.reset_max_memory_allocated()
	    torch.cuda.reset_peak_memory_stats()

	<span class="hljs-keyword">del</span> pipeline.text_encoder
	<span class="hljs-keyword">del</span> pipeline.text_encoder_2
	<span class="hljs-keyword">del</span> pipeline.tokenizer
	<span class="hljs-keyword">del</span> pipeline.tokenizer_2
	<span class="hljs-keyword">del</span> pipeline

	flush()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-i97tc3">接下来加载扩散变换器，它有125亿参数。这次，设置 <code>device_map="auto"</code> 以自动将模型分布在两个16GB GPU上。<code>auto</code> 策略由 <a href="https://hf.co/docs/accelerate/index" rel="nofollow">Accelerate</a> 支持，并作为 <a href="https://hf.co/docs/accelerate/concept_guides/big_model_inference" rel="nofollow">大模型推理</a> 功能的一部分可用。它首先将模型分布在最快的设备（GPU）上，然后在需要时移动到较慢的设备如CPU和硬盘。将模型参数存储在较慢设备上的权衡是推理延迟较慢。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoModel
	<span class="hljs-keyword">import</span> torch

	transformer = AutoModel.from_pretrained(
	    <span class="hljs-string">"black-forest-labs/FLUX.1-dev"</span>,
	    subfolder=<span class="hljs-string">"transformer"</span>,
	    device_map=<span class="hljs-string">"auto"</span>,
	    torch_dtype=torch.bfloat16
	)<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-1l0lzn4"><p>在任何时候，您可以尝试 <code>print(pipeline.hf_device_map)</code> 来查看各种模型如何在设备上分布。这对于跟踪模型的设备放置很有用。您也可以尝试 <code>print(transformer.hf_device_map)</code> 来查看变换器模型如何在设备上分片。</p></blockquote> <p data-svelte-h="svelte-1lgsyuc">将变换器模型添加到管道中以进行去噪，但将其他模型级组件如文本编码器和VAE设置为 <code>None</code>，因为您还不需要它们。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pipeline = FluxPipeline.from_pretrained(
	    <span class="hljs-string">"black-forest-labs/FLUX.1-dev"</span>,
	    text_encoder=<span class="hljs-literal">None</span>,
	    text_encoder_2=<span class="hljs-literal">None</span>,
	    tokenizer=<span class="hljs-literal">None</span>,
	    tokenizer_2=<span class="hljs-literal">None</span>,
	    vae=<span class="hljs-literal">None</span>,
	    transformer=transformer,
	    torch_dtype=torch.bfloat16
	)

	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Running denoising."</span>)
	height, width = <span class="hljs-number">768</span>, <span class="hljs-number">1360</span>
	latents = pipeline(
	    prompt_embeds=prompt_embeds,
	    pooled_prompt_embeds=pooled_prompt_embeds,
	    num_inference_steps=<span class="hljs-number">50</span>,
	    guidance_scale=<span class="hljs-number">3.5</span>,
	    height=height,
	    width=width,
	    output_type=<span class="hljs-string">"latent"</span>,
	).images<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x0nor">从内存中移除管道和变换器，因为它们不再需要。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">del</span> pipeline.transformer
	<span class="hljs-keyword">del</span> pipeline

	flush()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1v7ewyt">最后，使用变分自编码器（VAE）将潜在表示解码为图像。VAE通常足够小，可以在单个GPU上加载。</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> AutoencoderKL
	<span class="hljs-keyword">from</span> diffusers.image_processor <span class="hljs-keyword">import</span> VaeImageProcessor
	<span class="hljs-keyword">import</span> torch

	vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder=<span class="hljs-string">"vae"</span>, torch_dtype=torch.bfloat16).to(<span class="hljs-string">"cuda"</span>)
	vae_scale_factor = <span class="hljs-number">2</span> ** (<span class="hljs-built_in">len</span>(vae.config.block_out_channels) - <span class="hljs-number">1</span>)
	image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

	<span class="hljs-keyword">with</span> torch.no_grad():
	    <span class="hljs-built_in">print</span>(<span class="hljs-string">"运行解码中。"</span>)
	    latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
	    latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor

	    image = vae.decode(latents, return_dict=<span class="hljs-literal">False</span>)[<span class="hljs-number">0</span>]
	    image = image_processor.postprocess(image, output_type=<span class="hljs-string">"pil"</span>)
	image[<span class="hljs-number">0</span>].save(<span class="hljs-string">"split_transformer.png"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19rooo6">通过选择性加载和卸载在特定阶段所需的模型，并将最大模型分片到多个GPU上，可以在消费级GPU上运行大型模型的推理。</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/zh/training/distributed_inference.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Bare block: keeps the `const` bindings below out of the global scope.

		// Assigned without a declaration, so this becomes a property of the global
		// object (presumably read by the SvelteKit client runtime — confirm).
		__sveltekit_pa8cr6 = {
			assets: "/docs/diffusers/pr_11739/zh",
			base: "/docs/diffusers/pr_11739/zh",
			env: {}
		};

		// Resolve the host element now: document.currentScript is only available
		// while this script runs synchronously, not inside the promise callback.
		const mountPoint = document.currentScript.parentElement;

		// One slot per entry in `node_ids` below; both are empty here.
		const pageData = [null, null];

		// Load the two client entry modules in parallel.
		const entryModules = [
			import("/docs/diffusers/pr_11739/zh/_app/immutable/entry/start.95a8faef.js"),
			import("/docs/diffusers/pr_11739/zh/_app/immutable/entry/app.a988cdaf.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const [kitRuntime, appModule] = loaded;

			// Hand the app module and the host element to the runtime's start().
			kitRuntime.start(appModule, mountPoint, {
				node_ids: [0, 43],
				data: pageData,
				form: null,
				error: null
			});
		});
	}
	</script>

Xet Storage Details

Size: 39.9 kB
Xet hash: 932d2bd8d4e3457283f5a220573fae8178f3d8dabc3b248df7c1bbcc0f1446e9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.