2.5/distributed.html



<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <!-- Document metadata. The charset declaration comes first so the encoding is known before any other head content is parsed. -->
  <meta charset="utf-8">
  <meta name="robots" content="noindex">

  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>Distributed communication package - torch.distributed &mdash; PyTorch 2.5 documentation</title>

  <link rel="canonical" href="https://pytorch.org/docs/stable/distributed.html"/>

  <!-- Stylesheets. theme.css is linked once, after pygments.css, which keeps the cascade order the page had when it was linked both before and after. -->
  <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
  <link rel="stylesheet" href="_static/copybutton.css" type="text/css" />
  <!-- NOTE(review): two KaTeX stylesheet versions are loaded; rules from the later one (0.13.11) win on conflict. Confirm whether the 0.10.0-beta sheet is still needed. -->
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css" type="text/css" />
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css" type="text/css" />
  <link rel="stylesheet" href="_static/katex-math.css" type="text/css" />
  <link rel="stylesheet" href="_static/sphinx-dropdown.css" type="text/css" />
  <link rel="stylesheet" href="_static/panels-bootstrap.min.css" type="text/css" />
  <link rel="stylesheet" href="_static/css/jit.css" type="text/css" />
    <link rel="index" title="Index" href="genindex.html" />
    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="torch.distributed.tensor" href="distributed.tensor.html" />
    <link rel="prev" title="Fake tensor" href="torch.compiler_fake_tensor.html" />


  <!-- Google Tag Manager -->
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
    new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
    j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
    'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
    })(window,document,'script','dataLayer','GTM-T8XT4PS');</script>
    <!-- End Google Tag Manager -->


  <script src="_static/js/modernizr.min.js"></script>

  <!-- Preload the theme fonts -->

<link rel="preload" href="_static/fonts/FreightSans/freight-sans-book.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/FreightSans/freight-sans-medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/FreightSans/freight-sans-bold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/FreightSans/freight-sans-medium-italic.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2" as="font" type="font/woff2" crossorigin="anonymous">

<!-- Preload the katex fonts. The URLs use the same KaTeX version as the stylesheet linked last above (0.13.11); a preload whose URL differs from the one the stylesheet requests is downloaded but never used. -->

<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Math-Italic.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Main-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Main-Bold.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Size1-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Size4-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Size2-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Size3-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
<link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/fonts/KaTeX_Caligraphic-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous">
  <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.2/css/all.css" integrity="sha384-vSIIfh2YWi9wW0r9iZe7RJPrKwp6bG+s9QZMoITbCckVJqGCCRhc+ccxNcdpHuYu" crossorigin="anonymous">
</head>

<body class="pytorch-body">

<!-- Site-wide top navigation bar. It is placed inside <body>; flow content is not allowed between </head> and <body>. -->
<div class="container-fluid header-holder tutorials-header" id="header-holder">
  <div class="container">
    <div class="header-container">
      <a class="header-logo" href="https://pytorch.org/" aria-label="PyTorch"></a>

      <div class="main-menu">
        <ul>

          <!-- NOTE(review): id="resourcesDropdownButton" is repeated on every dropdown wrapper in this menu, so it is not document-unique. Left unchanged because scripts or styles outside this view may select on it; confirm before renaming. -->
          <li class="main-menu-item">
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="with-down-arrow">
                Learn
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/get-started">
                  <span class="dropdown-title">Get Started</span>
                  <p>Run PyTorch locally or get started quickly with one of the supported cloud platforms</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/tutorials">
                  <span class="dropdown-title">Tutorials</span>
                  <p>What's new in PyTorch tutorials</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/tutorials/beginner/basics/intro.html">
                  <span class="dropdown-title">Learn the Basics</span>
                  <p>Familiarize yourself with PyTorch concepts and modules</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/tutorials/recipes/recipes_index.html">
                  <span class="dropdown-title">PyTorch Recipes</span>
                  <p>Bite-size, ready-to-deploy PyTorch code examples</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/tutorials/beginner/introyt.html">
                  <span class="dropdown-title">Intro to PyTorch - YouTube Series</span>
                  <p>Master PyTorch basics with our engaging YouTube tutorial series</p>
                </a>
              </div>
            </div>
          </li>

          <li>
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="with-down-arrow">
                Ecosystem
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/ecosystem">
                  <span class="dropdown-title">Tools</span>
                  <p>Learn about the tools and frameworks in the PyTorch Ecosystem</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/#community-module">
                  <span class="dropdown-title">Community</span>
                  <p>Join the PyTorch developer community to contribute, learn, and get your questions answered</p>
                </a>
                <!-- Opens in a new tab; rel keeps the opened page from reaching back through window.opener. -->
                <a class="nav-dropdown-item" href="https://discuss.pytorch.org/" target="_blank" rel="noopener noreferrer">
                  <span class="dropdown-title">Forums</span>
                  <p>A place to discuss PyTorch code, issues, install, research</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/resources">
                  <span class="dropdown-title">Developer Resources</span>
                  <p>Find resources and get questions answered</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/ecosystem/contributor-awards-2023">
                  <span class="dropdown-title">Contributor Awards - 2023</span>
                  <p>Award winners announced at this year's PyTorch Conference</p>
                </a>
              </div>
            </div>
          </li>

          <li>
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="with-down-arrow">
                Edge
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/edge">
                  <span class="dropdown-title">About PyTorch Edge</span>
                  <p>Build innovative and privacy-aware AI experiences for edge devices</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/executorch-overview">
                  <span class="dropdown-title">ExecuTorch</span>
                  <p>End-to-end solution for enabling on-device inference capabilities across mobile and edge devices</p>
                </a>
              </div>
            </div>
          </li>

          <li class="main-menu-item">
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="with-down-arrow">
                Docs
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/docs/stable/index.html">
                  <span class="dropdown-title">PyTorch</span>
                  <p>Explore the documentation for comprehensive guidance on how to use PyTorch</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/pytorch-domains">
                  <span class="dropdown-title">PyTorch Domains</span>
                  <p>Read the PyTorch Domains documentation to learn more about domain-specific libraries</p>
                </a>
              </div>
            </div>
          </li>

          <li>
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="with-down-arrow">
                Blogs &amp; News
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/blog/">
                  <span class="dropdown-title">PyTorch Blog</span>
                  <p>Catch up on the latest technical news and happenings</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/community-blog">
                  <span class="dropdown-title">Community Blog</span>
                  <p>Stories from the PyTorch ecosystem</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/videos">
                  <span class="dropdown-title">Videos</span>
                  <p>Learn about the latest PyTorch tutorials, news, and more</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/community-stories">
                  <span class="dropdown-title">Community Stories</span>
                  <p>Learn how our community solves real, everyday machine learning problems with PyTorch</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/events">
                  <span class="dropdown-title">Events</span>
                  <p>Find events, webinars, and podcasts</p>
                </a>
              </div>
            </div>
          </li>

          <li>
            <div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
              <a class="with-down-arrow">
                About
              </a>
              <div class="resources-dropdown-menu">
                <a class="nav-dropdown-item" href="https://pytorch.org/foundation">
                  <span class="dropdown-title">PyTorch Foundation</span>
                  <p>Learn more about the PyTorch Foundation</p>
                </a>
                <a class="nav-dropdown-item" href="https://pytorch.org/governing-board">
                  <span class="dropdown-title">Governing Board</span>
                  <p></p>
                </a>
              </div>
            </div>
          </li>

          <li class="main-menu-item">
            <div class="no-dropdown">
              <a href="https://pytorch.org/join" data-cta="join">
                Become a Member
              </a>
            </div>
          </li>
          <li>
           <div class="main-menu-item">
             <!-- Icon-only link: aria-label supplies the accessible name. -->
             <a href="https://github.com/pytorch/pytorch" class="github-icon" aria-label="PyTorch on GitHub">
             </a>
           </div>
          </li>
          <!-- TODO: This block adds the search icon to the nav bar. We will enable it later.
          <li>
            <div class="main-menu-item">
             <a href="https://github.com/pytorch/pytorch" class="search-icon">
             </a>
            </div>
          </li>
          -->
        </ul>
      </div>

      <!-- Mobile menu toggle; it has no text content, so aria-label supplies the accessible name. -->
      <a class="main-menu-open-button" href="#" data-behavior="open-mobile-menu" aria-label="Open menu"></a>
    </div>
  </div>
</div>

   
    <!-- Bar holding the control that toggles the table of contents. -->
    <div class="table-of-contents-link-wrapper">
      <span>Table of Contents</span>
      <!-- The toggle link has no text content, so aria-label supplies its accessible name. -->
      <a href="#" class="toggle-table-of-contents" data-behavior="toggle-table-of-contents" aria-label="Toggle table of contents"></a>
    </div>

    <nav data-toggle="wy-nav-shift" class="pytorch-left-menu" id="pytorch-left-menu">
      <div class="pytorch-side-scroll">
        <div class="pytorch-menu pytorch-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          <div class="pytorch-left-menu-search">

    <!-- Version switcher: links to the list of published documentation versions. -->
    <div class="version">
      <a href="https://pytorch.org/docs/versions.html">2.5 &#x25BC;</a>
    </div>


<!-- Sidebar search form; submits the query to search.html via GET. -->
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <!-- A placeholder is not a label, so aria-label gives the field its accessible name. -->
    <input type="text" name="q" placeholder="Search Docs" aria-label="Search Docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          </div>

          
              <p class="caption" role="heading"><span class="caption-text">Community</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="community/build_ci_governance.html">PyTorch Governance | Build + CI</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/contribution_guide.html">PyTorch Contribution Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/design.html">PyTorch Design Philosophy</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/governance.html">PyTorch Governance | Mechanics</a></li>
<li class="toctree-l1"><a class="reference internal" href="community/persons_of_interest.html">PyTorch Governance | Maintainers</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Developer Notes</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="notes/amp_examples.html">Automatic Mixed Precision examples</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/autograd.html">Autograd mechanics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/broadcasting.html">Broadcasting semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/cpu_threading_torchscript_inference.html">CPU threading and TorchScript inference</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/cuda.html">CUDA semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/custom_operators.html">PyTorch Custom Operators Landing Page</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/ddp.html">Distributed Data Parallel</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/extending.html">Extending PyTorch</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/extending.func.html">Extending torch.func with autograd.Function</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/faq.html">Frequently Asked Questions</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/fsdp.html">FSDP Notes</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/get_start_xpu.html">Getting Started on Intel GPU</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/gradcheck.html">Gradcheck mechanics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/hip.html">HIP (ROCm) semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/large_scale_deployments.html">Features for large-scale deployments</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/modules.html">Modules</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/mps.html">MPS backend</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/multiprocessing.html">Multiprocessing best practices</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/numerical_accuracy.html">Numerical accuracy</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/randomness.html">Reproducibility</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/serialization.html">Serialization semantics</a></li>
<li class="toctree-l1"><a class="reference internal" href="notes/windows.html">Windows FAQ</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Language Bindings</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="cpp_index.html">C++</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/javadoc/">Javadoc</a></li>
<li class="toctree-l1"><a class="reference internal" href="deploy.html">torch::deploy</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Python API</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="torch.html">torch</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.html">torch.nn</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.functional.html">torch.nn.functional</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensors.html">torch.Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensor_attributes.html">Tensor Attributes</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensor_view.html">Tensor Views</a></li>
<li class="toctree-l1"><a class="reference internal" href="amp.html">torch.amp</a></li>
<li class="toctree-l1"><a class="reference internal" href="autograd.html">torch.autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="library.html">torch.library</a></li>
<li class="toctree-l1"><a class="reference internal" href="cpu.html">torch.cpu</a></li>
<li class="toctree-l1"><a class="reference internal" href="cuda.html">torch.cuda</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch_cuda_memory.html">Understanding CUDA Memory Usage</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch_cuda_memory.html#generating-a-snapshot">Generating a Snapshot</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch_cuda_memory.html#using-the-visualizer">Using the visualizer</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch_cuda_memory.html#snapshot-api-reference">Snapshot API Reference</a></li>
<li class="toctree-l1"><a class="reference internal" href="mps.html">torch.mps</a></li>
<li class="toctree-l1"><a class="reference internal" href="xpu.html">torch.xpu</a></li>
<li class="toctree-l1"><a class="reference internal" href="mtia.html">torch.mtia</a></li>
<li class="toctree-l1"><a class="reference internal" href="meta.html">Meta device</a></li>
<li class="toctree-l1"><a class="reference internal" href="backends.html">torch.backends</a></li>
<li class="toctree-l1"><a class="reference internal" href="export.html">torch.export</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">torch.distributed</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.tensor.html">torch.distributed.tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.algorithms.join.html">torch.distributed.algorithms.join</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.elastic.html">torch.distributed.elastic</a></li>
<li class="toctree-l1"><a class="reference internal" href="fsdp.html">torch.distributed.fsdp</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.tensor.parallel.html">torch.distributed.tensor.parallel</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.optim.html">torch.distributed.optim</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.pipelining.html">torch.distributed.pipelining</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributed.checkpoint.html">torch.distributed.checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="distributions.html">torch.distributions</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch.compiler.html">torch.compiler</a></li>
<li class="toctree-l1"><a class="reference internal" href="fft.html">torch.fft</a></li>
<li class="toctree-l1"><a class="reference internal" href="func.html">torch.func</a></li>
<li class="toctree-l1"><a class="reference internal" href="futures.html">torch.futures</a></li>
<li class="toctree-l1"><a class="reference internal" href="fx.html">torch.fx</a></li>
<li class="toctree-l1"><a class="reference internal" href="fx.experimental.html">torch.fx.experimental</a></li>
<li class="toctree-l1"><a class="reference internal" href="hub.html">torch.hub</a></li>
<li class="toctree-l1"><a class="reference internal" href="jit.html">torch.jit</a></li>
<li class="toctree-l1"><a class="reference internal" href="linalg.html">torch.linalg</a></li>
<li class="toctree-l1"><a class="reference internal" href="monitor.html">torch.monitor</a></li>
<li class="toctree-l1"><a class="reference internal" href="signal.html">torch.signal</a></li>
<li class="toctree-l1"><a class="reference internal" href="special.html">torch.special</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch.overrides.html">torch.overrides</a></li>
<li class="toctree-l1"><a class="reference internal" href="package.html">torch.package</a></li>
<li class="toctree-l1"><a class="reference internal" href="profiler.html">torch.profiler</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.init.html">torch.nn.init</a></li>
<li class="toctree-l1"><a class="reference internal" href="nn.attention.html">torch.nn.attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="onnx.html">torch.onnx</a></li>
<li class="toctree-l1"><a class="reference internal" href="optim.html">torch.optim</a></li>
<li class="toctree-l1"><a class="reference internal" href="complex_numbers.html">Complex Numbers</a></li>
<li class="toctree-l1"><a class="reference internal" href="ddp_comm_hooks.html">DDP Communication Hooks</a></li>
<li class="toctree-l1"><a class="reference internal" href="quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="rpc.html">Distributed RPC Framework</a></li>
<li class="toctree-l1"><a class="reference internal" href="random.html">torch.random</a></li>
<li class="toctree-l1"><a class="reference internal" href="masked.html">torch.masked</a></li>
<li class="toctree-l1"><a class="reference internal" href="nested.html">torch.nested</a></li>
<li class="toctree-l1"><a class="reference internal" href="size.html">torch.Size</a></li>
<li class="toctree-l1"><a class="reference internal" href="sparse.html">torch.sparse</a></li>
<li class="toctree-l1"><a class="reference internal" href="storage.html">torch.Storage</a></li>
<li class="toctree-l1"><a class="reference internal" href="testing.html">torch.testing</a></li>
<li class="toctree-l1"><a class="reference internal" href="utils.html">torch.utils</a></li>
<li class="toctree-l1"><a class="reference internal" href="benchmark_utils.html">torch.utils.benchmark</a></li>
<li class="toctree-l1"><a class="reference internal" href="bottleneck.html">torch.utils.bottleneck</a></li>
<li class="toctree-l1"><a class="reference internal" href="checkpoint.html">torch.utils.checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="cpp_extension.html">torch.utils.cpp_extension</a></li>
<li class="toctree-l1"><a class="reference internal" href="data.html">torch.utils.data</a></li>
<li class="toctree-l1"><a class="reference internal" href="deterministic.html">torch.utils.deterministic</a></li>
<li class="toctree-l1"><a class="reference internal" href="jit_utils.html">torch.utils.jit</a></li>
<li class="toctree-l1"><a class="reference internal" href="dlpack.html">torch.utils.dlpack</a></li>
<li class="toctree-l1"><a class="reference internal" href="mobile_optimizer.html">torch.utils.mobile_optimizer</a></li>
<li class="toctree-l1"><a class="reference internal" href="model_zoo.html">torch.utils.model_zoo</a></li>
<li class="toctree-l1"><a class="reference internal" href="tensorboard.html">torch.utils.tensorboard</a></li>
<li class="toctree-l1"><a class="reference internal" href="module_tracker.html">torch.utils.module_tracker</a></li>
<li class="toctree-l1"><a class="reference internal" href="type_info.html">Type Info</a></li>
<li class="toctree-l1"><a class="reference internal" href="named_tensor.html">Named Tensors</a></li>
<li class="toctree-l1"><a class="reference internal" href="name_inference.html">Named Tensors operator coverage</a></li>
<li class="toctree-l1"><a class="reference internal" href="config_mod.html">torch.__config__</a></li>
<li class="toctree-l1"><a class="reference internal" href="future_mod.html">torch.__future__</a></li>
<li class="toctree-l1"><a class="reference internal" href="logging.html">torch._logging</a></li>
<li class="toctree-l1"><a class="reference internal" href="torch_environment_variables.html">Torch Environment Variables</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Libraries</span></p>
<ul>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/audio/stable">torchaudio</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/data">TorchData</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/torchrec">TorchRec</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/serve">TorchServe</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/text/stable">torchtext</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/vision/stable">torchvision</a></li>
<li class="toctree-l1"><a class="reference external" href="https://pytorch.org/xla/">PyTorch on XLA Devices</a></li>
</ul>

            
        </div>
      </div>
    </nav>

    <div class="pytorch-container">
      <div class="pytorch-page-level-bar" id="pytorch-page-level-bar">
        <div class="pytorch-breadcrumbs-wrapper">
          

<!-- Breadcrumb trail: a link to the docs index, the current page title, and a link to the page's reStructuredText source. -->
<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="pytorch-breadcrumbs">

      <li>
        <a href="index.html">
            Docs
        </a> &gt;
      </li>

      <!-- aria-current marks this entry as the page being viewed. -->
      <li aria-current="page">Distributed communication package - torch.distributed</li>

      <li class="pytorch-breadcrumbs-aside">
            <!-- The image is the link's only content, so its alt text names the link destination. -->
            <a href="_sources/distributed.rst.txt" rel="nofollow"><img src="_static/images/view-page-source-icon.svg" alt="View page source"></a>
      </li>

  </ul>

</div>
        </div>

        <div class="pytorch-shortcuts-wrapper" id="pytorch-shortcuts-wrapper">
          Shortcuts
        </div>
      </div>

      <section data-toggle="wy-nav-shift" id="pytorch-content-wrap" class="pytorch-content-wrap">
        <div class="pytorch-content-left">

        
          <!-- Google Tag Manager (noscript) -->
          <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-T8XT4PS"
          height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
          <!-- End Google Tag Manager (noscript) -->
          
          <div class="rst-content">
          
            <div role="main" class="main-content" itemscope="itemscope" itemtype="http://schema.org/Article">
             <article itemprop="articleBody" id="pytorch-article" class="pytorch-article">
              
  <div class="section" id="distributed-communication-package-torch-distributed">
<h1>Distributed communication package - torch.distributed<a class="headerlink" href="#distributed-communication-package-torch-distributed" title="Permalink to this heading">¶</a></h1>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Please refer to <a class="reference external" href="https://pytorch.org/tutorials/beginner/dist_overview.html">PyTorch Distributed Overview</a>
for a brief introduction to all features related to distributed training.</p>
</div>
<span class="target" id="module-torch.distributed"></span><div class="section" id="backends">
<h2>Backends<a class="headerlink" href="#backends" title="Permalink to this heading">¶</a></h2>
<p><code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> supports three built-in backends, each with
different capabilities. The table below shows which functions are available
for use with CPU / CUDA tensors.
MPI supports CUDA only if the implementation used to build PyTorch supports it.</p>
<table class="docutils colwidths-auto align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Backend</p></th>
<th class="head" colspan="2"><p><code class="docutils literal notranslate"><span class="pre">gloo</span></code></p></th>
<th class="head" colspan="2"><p><code class="docutils literal notranslate"><span class="pre">mpi</span></code></p></th>
<th class="head" colspan="2"><p><code class="docutils literal notranslate"><span class="pre">nccl</span></code></p></th>
</tr>
<tr class="row-even"><th class="head"><p>Device</p></th>
<th class="head"><p>CPU</p></th>
<th class="head"><p>GPU</p></th>
<th class="head"><p>CPU</p></th>
<th class="head"><p>GPU</p></th>
<th class="head"><p>CPU</p></th>
<th class="head"><p>GPU</p></th>
</tr>
</thead>
<tbody>
<tr class="row-odd"><td><p>send</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-even"><td><p>recv</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-odd"><td><p>broadcast</p></td>
<td><p>✓</p></td>
<td><p>✓</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-even"><td><p>all_reduce</p></td>
<td><p>✓</p></td>
<td><p>✓</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-odd"><td><p>reduce</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-even"><td><p>all_gather</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-odd"><td><p>gather</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-even"><td><p>scatter</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-odd"><td><p>reduce_scatter</p></td>
<td><p>✘</p></td>
<td><p>✘</p></td>
<td><p>✘</p></td>
<td><p>✘</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-even"><td><p>all_to_all</p></td>
<td><p>✘</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
<tr class="row-odd"><td><p>barrier</p></td>
<td><p>✓</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
<td><p>?</p></td>
<td><p>✘</p></td>
<td><p>✓</p></td>
</tr>
</tbody>
</table>
<div class="section" id="backends-that-come-with-pytorch">
<h3>Backends that come with PyTorch<a class="headerlink" href="#backends-that-come-with-pytorch" title="Permalink to this heading">¶</a></h3>
<p>The PyTorch distributed package supports Linux (stable), MacOS (stable), and Windows (prototype).
By default for Linux, the Gloo and NCCL backends are built and included in PyTorch
distributed (NCCL only when building with CUDA). MPI is an optional backend that can only be
included if you build PyTorch from source (e.g. building PyTorch on a host that has MPI
installed).</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>As of PyTorch v1.8, Windows supports all collective communication backends except NCCL.
If the <cite>init_method</cite> argument of <a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">init_process_group()</span></code></a> points to a file, it must adhere
to the following schema:</p>
<ul class="simple">
<li><p>Local file system, <code class="docutils literal notranslate"><span class="pre">init_method=&quot;file:///d:/tmp/some_file&quot;</span></code></p></li>
<li><p>Shared file system, <code class="docutils literal notranslate"><span class="pre">init_method=&quot;file://////{machine_name}/{share_folder_name}/some_file&quot;</span></code></p></li>
</ul>
<p>As on the Linux platform, you can enable TcpStore by setting the environment variables
MASTER_ADDR and MASTER_PORT.</p>
</div>
</div>
<div class="section" id="which-backend-to-use">
<h3>Which backend to use?<a class="headerlink" href="#which-backend-to-use" title="Permalink to this heading">¶</a></h3>
<p>In the past, we were often asked: “which backend should I use?”.</p>
<ul class="simple">
<li><p>Rule of thumb</p>
<ul>
<li><p>Use the NCCL backend for distributed <strong>GPU</strong> training.</p></li>
<li><p>Use the Gloo backend for distributed <strong>CPU</strong> training.</p></li>
</ul>
</li>
<li><p>GPU hosts with InfiniBand interconnect</p>
<ul>
<li><p>Use NCCL, since it’s the only backend that currently supports
InfiniBand and GPUDirect.</p></li>
</ul>
</li>
<li><p>GPU hosts with Ethernet interconnect</p>
<ul>
<li><p>Use NCCL, since it currently provides the best distributed GPU
training performance, especially for multiprocess single-node or
multi-node distributed training. If you encounter any problem with
NCCL, use Gloo as the fallback option. (Note that Gloo currently
runs slower than NCCL for GPUs.)</p></li>
</ul>
</li>
<li><p>CPU hosts with InfiniBand interconnect</p>
<ul>
<li><p>If your InfiniBand has enabled IP over IB, use Gloo; otherwise,
use MPI instead. We are planning on adding InfiniBand support for
Gloo in the upcoming releases.</p></li>
</ul>
</li>
<li><p>CPU hosts with Ethernet interconnect</p>
<ul>
<li><p>Use Gloo, unless you have specific reasons to use MPI.</p></li>
</ul>
</li>
</ul>
</div>
<div class="section" id="common-environment-variables">
<h3>Common environment variables<a class="headerlink" href="#common-environment-variables" title="Permalink to this heading">¶</a></h3>
<div class="section" id="choosing-the-network-interface-to-use">
<h4>Choosing the network interface to use<a class="headerlink" href="#choosing-the-network-interface-to-use" title="Permalink to this heading">¶</a></h4>
<p>By default, both the NCCL and Gloo backends will try to find the right network interface to use.
If the automatically detected interface is not correct, you can override it using the following
environment variables (applicable to the respective backend):</p>
<ul class="simple">
<li><p><strong>NCCL_SOCKET_IFNAME</strong>, for example <code class="docutils literal notranslate"><span class="pre">export</span> <span class="pre">NCCL_SOCKET_IFNAME=eth0</span></code></p></li>
<li><p><strong>GLOO_SOCKET_IFNAME</strong>, for example <code class="docutils literal notranslate"><span class="pre">export</span> <span class="pre">GLOO_SOCKET_IFNAME=eth0</span></code></p></li>
</ul>
<p>If you’re using the Gloo backend, you can specify multiple interfaces by separating
them by a comma, like this: <code class="docutils literal notranslate"><span class="pre">export</span> <span class="pre">GLOO_SOCKET_IFNAME=eth0,eth1,eth2,eth3</span></code>.
The backend will dispatch operations in a round-robin fashion across these interfaces.
It is imperative that all processes specify the same number of interfaces in this variable.</p>
</div>
<div class="section" id="other-nccl-environment-variables">
<h4>Other NCCL environment variables<a class="headerlink" href="#other-nccl-environment-variables" title="Permalink to this heading">¶</a></h4>
<p><strong>Debugging</strong> - in case of NCCL failure, you can set <code class="docutils literal notranslate"><span class="pre">NCCL_DEBUG=INFO</span></code> to print an explicit
warning message as well as basic NCCL initialization information.</p>
<p>You may also use <code class="docutils literal notranslate"><span class="pre">NCCL_DEBUG_SUBSYS</span></code> to get more details about a specific
aspect of NCCL. For example, <code class="docutils literal notranslate"><span class="pre">NCCL_DEBUG_SUBSYS=COLL</span></code> would print logs of
collective calls, which may be helpful when debugging hangs, especially those
caused by collective type or message size mismatch. In case of topology
detection failure, it would be helpful to set <code class="docutils literal notranslate"><span class="pre">NCCL_DEBUG_SUBSYS=GRAPH</span></code>
to inspect the detailed detection result and save as reference if further help
from NCCL team is needed.</p>
<p><strong>Performance tuning</strong> - NCCL performs automatic tuning based on its topology detection to save users’
tuning effort. On some socket-based systems, users may still try tuning
<code class="docutils literal notranslate"><span class="pre">NCCL_SOCKET_NTHREADS</span></code> and <code class="docutils literal notranslate"><span class="pre">NCCL_NSOCKS_PERTHREAD</span></code> to increase socket
network bandwidth. These two environment variables have been pre-tuned by NCCL
for some cloud providers, such as AWS or GCP.</p>
<p>For a full list of NCCL environment variables, please refer to
<a class="reference external" href="https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html">NVIDIA NCCL’s official documentation</a>.</p>
</div>
</div>
</div>
<div class="section" id="basics">
<span id="distributed-basics"></span><h2>Basics<a class="headerlink" href="#basics" title="Permalink to this heading">¶</a></h2>
<p>The <cite>torch.distributed</cite> package provides PyTorch support and communication primitives
for multiprocess parallelism across several computation nodes running on one or more
machines. The class <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> builds on this
functionality to provide synchronous distributed training as a wrapper around any
PyTorch model. This differs from the kinds of parallelism provided by
<a class="reference internal" href="multiprocessing.html"><span class="doc">Multiprocessing package - torch.multiprocessing</span></a> and <a class="reference internal" href="generated/torch.nn.DataParallel.html#torch.nn.DataParallel" title="torch.nn.DataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.DataParallel()</span></code></a> in that it supports
multiple network-connected machines and in that the user must explicitly launch a separate
copy of the main training script for each process.</p>
<p>In the single-machine synchronous case, <cite>torch.distributed</cite> or the
<a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> wrapper may still have advantages over other
approaches to data-parallelism, including <a class="reference internal" href="generated/torch.nn.DataParallel.html#torch.nn.DataParallel" title="torch.nn.DataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.DataParallel()</span></code></a>:</p>
<ul class="simple">
<li><p>Each process maintains its own optimizer and performs a complete optimization step with each
iteration. While this may appear redundant, since the gradients have already been gathered
together and averaged across processes and are thus the same for every process, this means
that no parameter broadcast step is needed, reducing time spent transferring tensors between
nodes.</p></li>
<li><p>Each process contains an independent Python interpreter, eliminating the extra interpreter
overhead and “GIL-thrashing” that comes from driving several execution threads, model
replicas, or GPUs from a single Python process. This is especially important for models that
make heavy use of the Python runtime, including models with recurrent layers or many small
components.</p></li>
</ul>
</div>
<div class="section" id="initialization">
<h2>Initialization<a class="headerlink" href="#initialization" title="Permalink to this heading">¶</a></h2>
<p>The package needs to be initialized using the <a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.init_process_group()</span></code></a>
or <a class="reference internal" href="#torch.distributed.device_mesh.init_device_mesh" title="torch.distributed.device_mesh.init_device_mesh"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.device_mesh.init_device_mesh()</span></code></a> function before calling any other methods.
Both block until all processes have joined.</p>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.is_available">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">is_available</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed.html#is_available"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.is_available" title="Permalink to this definition">¶</a></dt>
<dd><p>Return <code class="docutils literal notranslate"><span class="pre">True</span></code> if the distributed package is available.</p>
<p>Otherwise,
<code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> does not expose any other APIs. Currently,
<code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> is available on Linux, MacOS and Windows. Set
<code class="docutils literal notranslate"><span class="pre">USE_DISTRIBUTED=1</span></code> to enable it when building PyTorch from source.
Currently, the default value is <code class="docutils literal notranslate"><span class="pre">USE_DISTRIBUTED=1</span></code> for Linux and Windows,
<code class="docutils literal notranslate"><span class="pre">USE_DISTRIBUTED=0</span></code> for MacOS.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)">bool</a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.init_process_group">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">init_process_group</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">backend</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">init_method</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">world_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">rank</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">-1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">store</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pg_options</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device_id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span 
class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#init_process_group"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.init_process_group" title="Permalink to this definition">¶</a></dt>
<dd><p>Initialize the default distributed process group.</p>
<p>This will also initialize the distributed package.</p>
<dl class="simple">
<dt>There are 2 main ways to initialize a process group:</dt><dd><ol class="arabic simple">
<li><p>Specify <code class="docutils literal notranslate"><span class="pre">store</span></code>, <code class="docutils literal notranslate"><span class="pre">rank</span></code>, and <code class="docutils literal notranslate"><span class="pre">world_size</span></code> explicitly.</p></li>
<li><p>Specify <code class="docutils literal notranslate"><span class="pre">init_method</span></code> (a URL string) which indicates where/how
to discover peers. Optionally specify <code class="docutils literal notranslate"><span class="pre">rank</span></code> and <code class="docutils literal notranslate"><span class="pre">world_size</span></code>,
or encode all required parameters in the URL and omit them.</p></li>
</ol>
</dd>
</dl>
<p>If neither is specified, <code class="docutils literal notranslate"><span class="pre">init_method</span></code> is assumed to be “env://”.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>backend</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference internal" href="#torch.distributed.Backend" title="torch.distributed.Backend"><em>Backend</em></a><em>, </em><em>optional</em>) – The backend to use. Depending on
build-time configurations, valid values include <code class="docutils literal notranslate"><span class="pre">mpi</span></code>, <code class="docutils literal notranslate"><span class="pre">gloo</span></code>,
<code class="docutils literal notranslate"><span class="pre">nccl</span></code>, and <code class="docutils literal notranslate"><span class="pre">ucc</span></code>. If the backend is not provided, then both a <code class="docutils literal notranslate"><span class="pre">gloo</span></code>
and <code class="docutils literal notranslate"><span class="pre">nccl</span></code> backend will be created, see notes below for how multiple
backends are managed. This field can be given as a lowercase string
(e.g., <code class="docutils literal notranslate"><span class="pre">&quot;gloo&quot;</span></code>), which can also be accessed via
<a class="reference internal" href="#torch.distributed.Backend" title="torch.distributed.Backend"><code class="xref py py-class docutils literal notranslate"><span class="pre">Backend</span></code></a> attributes (e.g., <code class="docutils literal notranslate"><span class="pre">Backend.GLOO</span></code>). If using
multiple processes per machine with <code class="docutils literal notranslate"><span class="pre">nccl</span></code> backend, each process
must have exclusive access to every GPU it uses, as sharing GPUs
between processes can result in deadlocks. <code class="docutils literal notranslate"><span class="pre">ucc</span></code> backend is
experimental.</p></li>
<li><p><strong>init_method</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – URL specifying how to initialize the
process group. Default is “env://” if no
<code class="docutils literal notranslate"><span class="pre">init_method</span></code> or <code class="docutils literal notranslate"><span class="pre">store</span></code> is specified.
Mutually exclusive with <code class="docutils literal notranslate"><span class="pre">store</span></code>.</p></li>
<li><p><strong>world_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Number of processes participating in
the job. Required if <code class="docutils literal notranslate"><span class="pre">store</span></code> is specified.</p></li>
<li><p><strong>rank</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Rank of the current process (it should be a
number between 0 and <code class="docutils literal notranslate"><span class="pre">world_size</span></code>-1).
Required if <code class="docutils literal notranslate"><span class="pre">store</span></code> is specified.</p></li>
<li><p><strong>store</strong> (<a class="reference internal" href="#torch.distributed.Store" title="torch.distributed.Store"><em>Store</em></a><em>, </em><em>optional</em>) – Key/value store accessible to all workers, used
to exchange connection/address information.
Mutually exclusive with <code class="docutils literal notranslate"><span class="pre">init_method</span></code>.</p></li>
<li><p><strong>timeout</strong> (<em>timedelta</em><em>, </em><em>optional</em>) – Timeout for operations executed against
the process group. Default value is 10 minutes for NCCL and 30 minutes for other backends.
This is the duration after which collectives will be aborted asynchronously and the process will crash.
This is done since CUDA execution is async and it is no longer safe to continue executing user code since
failed async NCCL operations might result in subsequent CUDA operations running on corrupted data.
When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout.</p></li>
<li><p><strong>group_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em><em>, </em><em>deprecated</em>) – Group name. This argument is ignored.</p></li>
<li><p><strong>pg_options</strong> (<em>ProcessGroupOptions</em><em>, </em><em>optional</em>) – process group options
specifying what additional options need to be passed in during
the construction of specific process groups. As of now, the only
option we support is <code class="docutils literal notranslate"><span class="pre">ProcessGroupNCCL.Options</span></code> for the <code class="docutils literal notranslate"><span class="pre">nccl</span></code>
backend; <code class="docutils literal notranslate"><span class="pre">is_high_priority_stream</span></code> can be specified so that
the nccl backend can pick up high priority CUDA streams when
there are compute kernels waiting. For other available options to configure nccl,
see <a class="reference external" href="https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t">https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t</a>.</p></li>
<li><p><strong>device_id</strong> (<a class="reference internal" href="tensor_attributes.html#torch.device" title="torch.device"><em>torch.device</em></a><em>, </em><em>optional</em>) – a single, specific device
to “bind” this process to, allowing for backend-specific
optimizations.  Currently this has two effects, only under
NCCL: the communicator is immediately formed (calling
<code class="docutils literal notranslate"><span class="pre">ncclCommInit*</span></code> immediately rather than the normal lazy
call) and sub-groups will use <code class="docutils literal notranslate"><span class="pre">ncclCommSplit</span></code> when
possible to avoid unnecessary overhead of group creation. If you
want to surface NCCL initialization errors early, you can also use this
field.</p></li>
</ul>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>To enable <code class="docutils literal notranslate"><span class="pre">backend</span> <span class="pre">==</span> <span class="pre">Backend.MPI</span></code>, PyTorch needs to be built from source
on a system that supports MPI.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Support for multiple backends is experimental. Currently when no backend is
specified, both <code class="docutils literal notranslate"><span class="pre">gloo</span></code> and <code class="docutils literal notranslate"><span class="pre">nccl</span></code> backends will be created. The <code class="docutils literal notranslate"><span class="pre">gloo</span></code> backend
will be used for collectives with CPU tensors and the <code class="docutils literal notranslate"><span class="pre">nccl</span></code> backend will be used
for collectives with CUDA tensors. A custom backend can be specified by passing in
a string with format “&lt;device_type&gt;:&lt;backend_name&gt;,&lt;device_type&gt;:&lt;backend_name&gt;”, e.g.
“cpu:gloo,cuda:custom_backend”.</p>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.device_mesh.init_device_mesh">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.device_mesh.</span></span><span class="sig-name descname"><span class="pre">init_device_mesh</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">device_type</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mesh_shape</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mesh_dim_names</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/device_mesh.html#init_device_mesh"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.device_mesh.init_device_mesh" title="Permalink to this definition">¶</a></dt>
<dd><p>Initializes a <cite>DeviceMesh</cite> based on <cite>device_type</cite>, <cite>mesh_shape</cite>, and <cite>mesh_dim_names</cite> parameters.</p>
<p>This creates a DeviceMesh with an n-dimensional array layout, where <cite>n</cite> is the length of <cite>mesh_shape</cite>.
If <cite>mesh_dim_names</cite> is provided, each dimension is labeled as <cite>mesh_dim_names[i]</cite>.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p><cite>init_device_mesh</cite> follows the SPMD programming model, meaning the same PyTorch Python program
runs on all processes/ranks in the cluster. Ensure <cite>mesh_shape</cite> (the dimensions of the nD array
describing device layout) is identical across all ranks. Inconsistent <cite>mesh_shape</cite> may lead to hanging.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>If no process group is found, init_device_mesh will initialize the distributed process group/groups
required for distributed communications behind the scenes.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>device_type</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”.
Passing in a device type with a GPU index, such as “cuda:0”, is not allowed.</p></li>
<li><p><strong>mesh_shape</strong> (<em>Tuple</em><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>]</em>) – A tuple defining the dimensions of the multi-dimensional array
describing the layout of devices.</p></li>
<li><p><strong>mesh_dim_names</strong> (<em>Tuple</em><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>]</em><em>, </em><em>optional</em>) – A tuple of mesh dimension names to assign to each dimension
of the multi-dimensional array describing the layout of devices. Its length must match the length
of <cite>mesh_shape</cite>. Each string in <cite>mesh_dim_names</cite> must be unique.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A <a class="reference internal" href="#torch.distributed.device_mesh.DeviceMesh" title="torch.distributed.device_mesh.DeviceMesh"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeviceMesh</span></code></a> object representing the device layout.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference internal" href="#torch.distributed.device_mesh.DeviceMesh" title="torch.distributed.device_mesh.DeviceMesh">DeviceMesh</a></p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">torch.distributed.device_mesh</span> <span class="kn">import</span> <span class="n">init_device_mesh</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mesh_1d</span> <span class="o">=</span> <span class="n">init_device_mesh</span><span class="p">(</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">mesh_shape</span><span class="o">=</span><span class="p">(</span><span class="mi">8</span><span class="p">,))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mesh_2d</span> <span class="o">=</span> <span class="n">init_device_mesh</span><span class="p">(</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">mesh_shape</span><span class="o">=</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">8</span><span class="p">),</span> <span class="n">mesh_dim_names</span><span class="o">=</span><span class="p">(</span><span class="s2">&quot;dp&quot;</span><span class="p">,</span> <span class="s2">&quot;tp&quot;</span><span class="p">))</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.is_initialized">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">is_initialized</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#is_initialized"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.is_initialized" title="Permalink to this definition">¶</a></dt>
<dd><p>Check if the default process group has been initialized.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)">bool</a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.is_mpi_available">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">is_mpi_available</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#is_mpi_available"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.is_mpi_available" title="Permalink to this definition">¶</a></dt>
<dd><p>Check if the MPI backend is available.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)">bool</a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.is_nccl_available">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">is_nccl_available</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#is_nccl_available"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.is_nccl_available" title="Permalink to this definition">¶</a></dt>
<dd><p>Check if the NCCL backend is available.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)">bool</a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.is_gloo_available">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">is_gloo_available</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#is_gloo_available"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.is_gloo_available" title="Permalink to this definition">¶</a></dt>
<dd><p>Check if the Gloo backend is available.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)">bool</a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.is_torchelastic_launched">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">is_torchelastic_launched</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#is_torchelastic_launched"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.is_torchelastic_launched" title="Permalink to this definition">¶</a></dt>
<dd><p>Check whether this process was launched with <code class="docutils literal notranslate"><span class="pre">torch.distributed.elastic</span></code> (aka torchelastic).</p>
<p>The existence of <code class="docutils literal notranslate"><span class="pre">TORCHELASTIC_RUN_ID</span></code> environment
variable is used as a proxy to determine whether the current process
was launched with torchelastic. This is a reasonable proxy since
<code class="docutils literal notranslate"><span class="pre">TORCHELASTIC_RUN_ID</span></code> maps to the rendezvous id which is always a
non-null value indicating the job id for peer discovery purposes.</p>
<dl class="field-list simple">
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)">bool</a></p>
</dd>
</dl>
</dd></dl>

<hr class="docutils" />
<p>Currently three initialization methods are supported:</p>
<div class="section" id="tcp-initialization">
<h3>TCP initialization<a class="headerlink" href="#tcp-initialization" title="Permalink to this heading">¶</a></h3>
<p>There are two ways to initialize using TCP, both requiring a network address
reachable from all processes and a desired <code class="docutils literal notranslate"><span class="pre">world_size</span></code>. The first way
requires specifying an address that belongs to the rank 0 process. This
initialization method requires that all processes have manually specified ranks.</p>
<p>Note that multicast addresses are no longer supported in the latest distributed
package. <code class="docutils literal notranslate"><span class="pre">group_name</span></code> is deprecated as well.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>

<span class="c1"># Use address of one of the machines</span>
<span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="n">backend</span><span class="p">,</span> <span class="n">init_method</span><span class="o">=</span><span class="s1">&#39;tcp://10.1.1.20:23456&#39;</span><span class="p">,</span>
                        <span class="n">rank</span><span class="o">=</span><span class="n">args</span><span class="o">.</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="o">=</span><span class="mi">4</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="shared-file-system-initialization">
<h3>Shared file-system initialization<a class="headerlink" href="#shared-file-system-initialization" title="Permalink to this heading">¶</a></h3>
<p>Another initialization method makes use of a file system that is shared and
visible from all machines in a group, along with a desired <code class="docutils literal notranslate"><span class="pre">world_size</span></code>. The URL should start
with <code class="docutils literal notranslate"><span class="pre">file://</span></code> and contain a path to a non-existent file (in an existing
directory) on a shared file system. File-system initialization will automatically
create that file if it doesn’t exist, but will not delete the file. Therefore, it
is your responsibility to make sure that the file is cleaned up before the next
<a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">init_process_group()</span></code></a> call on the same file path/name.</p>
<p>Note that automatic rank assignment is not supported anymore in the latest
distributed package and <code class="docutils literal notranslate"><span class="pre">group_name</span></code> is deprecated as well.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>This method assumes that the file system supports locking using <code class="docutils literal notranslate"><span class="pre">fcntl</span></code> - most
local systems and NFS support it.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>This method will always create the file and try its best to clean up and remove
the file at the end of the program. In other words, each initialization with
the file init method will need a brand new empty file in order for the initialization
to succeed. If the same file used by the previous initialization (which happens not
to get cleaned up) is used again, this is unexpected behavior and can often cause
deadlocks and failures. Therefore, even though this method will try its best to clean up
the file, if the auto-delete happens to be unsuccessful, it is your responsibility
to ensure that the file is removed at the end of training to prevent the same
file from being reused the next time. This is especially important
if you plan to call <a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">init_process_group()</span></code></a> multiple times on the same file name.
In other words, if the file is not removed/cleaned up and you call
<a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">init_process_group()</span></code></a> again on that file, failures are expected.
As a rule of thumb, make sure that the file is non-existent or
empty every time <a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">init_process_group()</span></code></a> is called.</p>
</div>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>

<span class="c1"># rank should always be specified</span>
<span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="n">backend</span><span class="p">,</span> <span class="n">init_method</span><span class="o">=</span><span class="s1">&#39;file:///mnt/nfs/sharedfile&#39;</span><span class="p">,</span>
                        <span class="n">world_size</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">args</span><span class="o">.</span><span class="n">rank</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="environment-variable-initialization">
<h3>Environment variable initialization<a class="headerlink" href="#environment-variable-initialization" title="Permalink to this heading">¶</a></h3>
<p>This method will read the configuration from environment variables, allowing
one to fully customize how the information is obtained. The variables to be set
are:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">MASTER_PORT</span></code> - required; has to be a free port on machine with rank 0</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">MASTER_ADDR</span></code> - required (except for rank 0); address of rank 0 node</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">WORLD_SIZE</span></code> - required; can be set either here, or in a call to init function</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">RANK</span></code> - required; can be set either here, or in a call to init function</p></li>
</ul>
<p>The machine with rank 0 will be used to set up all connections.</p>
<p>This is the default method, meaning that <code class="docutils literal notranslate"><span class="pre">init_method</span></code> does not have to be specified (or
can be <code class="docutils literal notranslate"><span class="pre">env://</span></code>).</p>
</div>
</div>
<div class="section" id="post-initialization">
<h2>Post-Initialization<a class="headerlink" href="#post-initialization" title="Permalink to this heading">¶</a></h2>
<p>Once <a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.init_process_group()</span></code></a> has been run, the following functions can be used. To
check whether the process group has already been initialized use <a class="reference internal" href="#torch.distributed.is_initialized" title="torch.distributed.is_initialized"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.is_initialized()</span></code></a>.</p>
<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.Backend">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">Backend</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#Backend"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.Backend" title="Permalink to this definition">¶</a></dt>
<dd><p>An enum-like class for backends.</p>
<p>Available backends: GLOO, NCCL, UCC, MPI, and other registered backends.</p>
<p>The values of this class are lowercase strings, e.g., <code class="docutils literal notranslate"><span class="pre">&quot;gloo&quot;</span></code>. They can
be accessed as attributes, e.g., <code class="docutils literal notranslate"><span class="pre">Backend.NCCL</span></code>.</p>
<p>This class can be directly called to parse the string, e.g.,
<code class="docutils literal notranslate"><span class="pre">Backend(backend_str)</span></code> will check if <code class="docutils literal notranslate"><span class="pre">backend_str</span></code> is valid, and
return the parsed lowercase string if so. It also accepts uppercase strings,
e.g., <code class="docutils literal notranslate"><span class="pre">Backend(&quot;GLOO&quot;)</span></code> returns <code class="docutils literal notranslate"><span class="pre">&quot;gloo&quot;</span></code>.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The entry <code class="docutils literal notranslate"><span class="pre">Backend.UNDEFINED</span></code> is present but only used as
initial value of some fields. Users should neither use it directly
nor assume its existence.</p>
</div>
<dl class="field-list simple">
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="torch.distributed.Backend.register_backend">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">register_backend</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">func</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">extended_api</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">devices</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#Backend.register_backend"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.Backend.register_backend" title="Permalink to this definition">¶</a></dt>
<dd><p>Register a new backend with the given name and instantiating function.</p>
<p>This class method is used by 3rd party <code class="docutils literal notranslate"><span class="pre">ProcessGroup</span></code> extension to
register new backends.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – Backend name of the <code class="docutils literal notranslate"><span class="pre">ProcessGroup</span></code> extension. It
should match the one in <code class="docutils literal notranslate"><span class="pre">init_process_group()</span></code>.</p></li>
<li><p><strong>func</strong> (<em>function</em>) – Function handler that instantiates the backend.
The function should be implemented in the backend
extension and takes four arguments, including
<code class="docutils literal notranslate"><span class="pre">store</span></code>, <code class="docutils literal notranslate"><span class="pre">rank</span></code>, <code class="docutils literal notranslate"><span class="pre">world_size</span></code>, and <code class="docutils literal notranslate"><span class="pre">timeout</span></code>.</p></li>
<li><p><strong>extended_api</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether the backend supports extended argument structure.
Default: <code class="docutils literal notranslate"><span class="pre">False</span></code>. If set to <code class="docutils literal notranslate"><span class="pre">True</span></code>, the backend
will get an instance of <code class="docutils literal notranslate"><span class="pre">c10d::DistributedBackendOptions</span></code>, and
a process group options object as defined by the backend implementation.</p></li>
<li><p><strong>devices</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em> of </em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – device type(s) this backend
supports, e.g. “cpu”, “cuda”, etc. If <cite>None</cite>,
both “cpu” and “cuda” are assumed.</p></li>
</ul>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>This support of 3rd party backend is experimental and subject to change.</p>
</div>
</dd></dl>

</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.get_backend">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">get_backend</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#get_backend"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.get_backend" title="Permalink to this definition">¶</a></dt>
<dd><p>Return the backend of the given process group.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. The
default is the general main process group. If another specific group
is specified, the calling process must be part of <code class="xref py py-attr docutils literal notranslate"><span class="pre">group</span></code>.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The backend of the given process group as a lower case string.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference internal" href="#torch.distributed.Backend" title="torch.distributed.distributed_c10d.Backend"><em>Backend</em></a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.get_rank">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">get_rank</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#get_rank"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.get_rank" title="Permalink to this definition">¶</a></dt>
<dd><p>Return the rank of the current process in the provided <code class="docutils literal notranslate"><span class="pre">group</span></code>, default otherwise.</p>
<p>Rank is a unique identifier assigned to each process within a distributed
process group. Ranks are always consecutive integers ranging from 0 to
<code class="docutils literal notranslate"><span class="pre">world_size</span> <span class="pre">-</span> <span class="pre">1</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The rank of the current process in the process group, or
-1 if it is not part of the group.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a></p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.get_world_size">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">get_world_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#get_world_size"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.get_world_size" title="Permalink to this definition">¶</a></dt>
<dd><p>Return the number of processes in the current process group.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>The world size of the process group, or
-1 if the current process is not part of the group.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a></p>
</dd>
</dl>
</dd></dl>

</div>
<div class="section" id="shutdown">
<h2>Shutdown<a class="headerlink" href="#shutdown" title="Permalink to this heading">¶</a></h2>
<p>It is important to clean up resources on exit by calling <code class="xref py py-func docutils literal notranslate"><span class="pre">destroy_process_group()</span></code>.</p>
<p>The simplest pattern to follow is to destroy every process group and backend by calling
<code class="xref py py-func docutils literal notranslate"><span class="pre">destroy_process_group()</span></code> with the default value of None for the <cite>group</cite> argument, at a
point in the training script where communications are no longer needed, usually near the
end of main().  The call should be made once per trainer-process, not at the outer
process-launcher level.</p>
<p>If <code class="xref py py-func docutils literal notranslate"><span class="pre">destroy_process_group()</span></code> is not called by all ranks in a process group within the timeout duration,
especially when there are multiple process-groups in the application e.g. for N-D parallelism,
hangs on exit are possible.  This is because the destructor for ProcessGroupNCCL calls ncclCommAbort,
which must be called collectively, but the order of calling ProcessGroupNCCL’s destructor if called
by Python’s GC is not deterministic. Calling <code class="xref py py-func docutils literal notranslate"><span class="pre">destroy_process_group()</span></code> helps by ensuring
ncclCommAbort is called in a consistent order across ranks, and avoids calling ncclCommAbort
during ProcessGroupNCCL’s destructor.</p>
<div class="section" id="reinitialization">
<h3>Reinitialization<a class="headerlink" href="#reinitialization" title="Permalink to this heading">¶</a></h3>
<p><cite>destroy_process_group</cite> can also be used to destroy individual process groups.  One use
case could be fault tolerant training, where a process group may be destroyed and then
a new one initialized during runtime.  In this case, it’s critical to synchronize the trainer
processes using some means other than torch.distributed primitives <em>after</em> calling destroy and
before subsequently initializing.  This behavior is currently unsupported/untested, due to
the difficulty of achieving this synchronization, and is considered a known issue.  Please file
a GitHub issue or RFC if this is a use case that’s blocking you.</p>
</div>
</div>
<hr class="docutils" />
<div class="section" id="distributed-key-value-store">
<h2>Distributed Key-Value Store<a class="headerlink" href="#distributed-key-value-store" title="Permalink to this heading">¶</a></h2>
<p>The distributed package comes with a distributed key-value store, which can be
used to share information between processes in the group as well as to
initialize the distributed package in
<a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.init_process_group()</span></code></a> (by explicitly creating the store
as an alternative to specifying <code class="docutils literal notranslate"><span class="pre">init_method</span></code>). There are 3 choices for
Key-Value Stores: <a class="reference internal" href="#torch.distributed.TCPStore" title="torch.distributed.TCPStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">TCPStore</span></code></a>,
<a class="reference internal" href="#torch.distributed.FileStore" title="torch.distributed.FileStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileStore</span></code></a>, and <a class="reference internal" href="#torch.distributed.HashStore" title="torch.distributed.HashStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">HashStore</span></code></a>.</p>
<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.Store">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">Store</span></span><a class="headerlink" href="#torch.distributed.Store" title="Permalink to this definition">¶</a></dt>
<dd><p>Base class for all store implementations, such as the 3 provided by PyTorch
distributed: (<a class="reference internal" href="#torch.distributed.TCPStore" title="torch.distributed.TCPStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">TCPStore</span></code></a>, <a class="reference internal" href="#torch.distributed.FileStore" title="torch.distributed.FileStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileStore</span></code></a>,
and <a class="reference internal" href="#torch.distributed.HashStore" title="torch.distributed.HashStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">HashStore</span></code></a>).</p>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.TCPStore">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">TCPStore</span></span><a class="headerlink" href="#torch.distributed.TCPStore" title="Permalink to this definition">¶</a></dt>
<dd><p>A TCP-based distributed key-value store implementation. The server store holds
the data, while the client stores can connect to the server store over TCP and
perform actions such as <code class="xref py py-meth docutils literal notranslate"><span class="pre">set()</span></code> to insert a key-value
pair, <code class="xref py py-meth docutils literal notranslate"><span class="pre">get()</span></code> to retrieve a key-value pair, etc. There
should always be one server store initialized because the client store(s) will wait for
the server to establish a connection.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>host_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The hostname or IP Address the server store should run on.</p></li>
<li><p><strong>port</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – The port on which the server store should listen for incoming requests.</p></li>
<li><p><strong>world_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – The total number of store users (number of clients + 1 for the server). Default is None (None indicates a non-fixed number of store users).</p></li>
<li><p><strong>is_master</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – True when initializing the server store and False for client stores. Default is False.</p></li>
<li><p><strong>timeout</strong> (<em>timedelta</em><em>, </em><em>optional</em>) – Timeout used by the store during initialization and for methods such as <code class="xref py py-meth docutils literal notranslate"><span class="pre">get()</span></code> and <code class="xref py py-meth docutils literal notranslate"><span class="pre">wait()</span></code>. Default is timedelta(seconds=300)</p></li>
<li><p><strong>wait_for_workers</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether to wait for all the workers to connect with the server store. This is only applicable when world_size is a fixed value. Default is True.</p></li>
<li><p><strong>multi_tenant</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – If True, all <code class="docutils literal notranslate"><span class="pre">TCPStore</span></code> instances in the current process with the same host/port will use the same underlying <code class="docutils literal notranslate"><span class="pre">TCPServer</span></code>. Default is False.</p></li>
<li><p><strong>master_listen_fd</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – If specified, the underlying <code class="docutils literal notranslate"><span class="pre">TCPServer</span></code> will listen on this file descriptor, which must be a socket already bound to <code class="docutils literal notranslate"><span class="pre">port</span></code>. Useful to avoid port assignment races in some scenarios. Default is None (meaning the server creates a new socket and attempts to bind it to <code class="docutils literal notranslate"><span class="pre">port</span></code>).</p></li>
<li><p><strong>use_libuv</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – If True, use libuv for the <code class="docutils literal notranslate"><span class="pre">TCPServer</span></code> backend. Default is True.</p></li>
</ul>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Run on process 1 (server)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">server_store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">1234</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Run on process 2 (client)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">client_store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">1234</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Use any of the store methods from either the client or server after initialization</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">server_store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">client_store</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.HashStore">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">HashStore</span></span><a class="headerlink" href="#torch.distributed.HashStore" title="Permalink to this definition">¶</a></dt>
<dd><p>A thread-safe store implementation based on an underlying hashmap. This store can be used
within the same process (for example, by other threads), but cannot be used across processes.</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">HashStore</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># store can be used from other threads</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Use any of the store methods after initialization</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.FileStore">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">FileStore</span></span><a class="headerlink" href="#torch.distributed.FileStore" title="Permalink to this definition">¶</a></dt>
<dd><p>A store implementation that uses a file to store the underlying key-value pairs.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>file_name</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – path of the file in which to store the key-value pairs</p></li>
<li><p><strong>world_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – The total number of processes using the store. Default is -1 (a negative value indicates a non-fixed number of store users).</p></li>
</ul>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store1</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">FileStore</span><span class="p">(</span><span class="s2">&quot;/tmp/filestore&quot;</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store2</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">FileStore</span><span class="p">(</span><span class="s2">&quot;/tmp/filestore&quot;</span><span class="p">,</span> <span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Use any of the store methods from either store instance after initialization</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store1</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store2</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.PrefixStore">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">PrefixStore</span></span><a class="headerlink" href="#torch.distributed.PrefixStore" title="Permalink to this definition">¶</a></dt>
<dd><p>A wrapper around any of the 3 key-value stores (<a class="reference internal" href="#torch.distributed.TCPStore" title="torch.distributed.TCPStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">TCPStore</span></code></a>,
<a class="reference internal" href="#torch.distributed.FileStore" title="torch.distributed.FileStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileStore</span></code></a>, and <a class="reference internal" href="#torch.distributed.HashStore" title="torch.distributed.HashStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">HashStore</span></code></a>)
that adds a prefix to each key inserted into the store.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>prefix</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The prefix string that is prepended to each key before being inserted into the store.</p></li>
<li><p><strong>store</strong> (<em>torch.distributed.Store</em>) – A store object that forms the underlying key-value store.</p></li>
</ul>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.set">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">set</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg1</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.set" title="Permalink to this definition">¶</a></dt>
<dd><p>Inserts the key-value pair into the store based on the supplied <code class="docutils literal notranslate"><span class="pre">key</span></code> and
<code class="docutils literal notranslate"><span class="pre">value</span></code>. If <code class="docutils literal notranslate"><span class="pre">key</span></code> already exists in the store, it will overwrite the old
value with the new supplied <code class="docutils literal notranslate"><span class="pre">value</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>key</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The key to be added to the store.</p></li>
<li><p><strong>value</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The value associated with <code class="docutils literal notranslate"><span class="pre">key</span></code> to be added to the store.</p></li>
</ul>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Should return b&quot;first_value&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.get">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">get</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.13)"><span class="pre">bytes</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.get" title="Permalink to this definition">¶</a></dt>
<dd><p>Retrieves the value associated with the given <code class="docutils literal notranslate"><span class="pre">key</span></code> in the store. If <code class="docutils literal notranslate"><span class="pre">key</span></code> is not
present in the store, the function will wait for <code class="docutils literal notranslate"><span class="pre">timeout</span></code>, which is defined
when initializing the store, before throwing an exception.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>key</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The function will return the value associated with this key.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Value associated with <code class="docutils literal notranslate"><span class="pre">key</span></code> if <code class="docutils literal notranslate"><span class="pre">key</span></code> is in the store.</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Should return b&quot;first_value&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.add">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">add</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg1</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.add" title="Permalink to this definition">¶</a></dt>
<dd><p>The first call to add for a given <code class="docutils literal notranslate"><span class="pre">key</span></code> creates a counter associated
with <code class="docutils literal notranslate"><span class="pre">key</span></code> in the store, initialized to <code class="docutils literal notranslate"><span class="pre">amount</span></code>. Subsequent calls to add
with the same <code class="docutils literal notranslate"><span class="pre">key</span></code> increment the counter by the specified <code class="docutils literal notranslate"><span class="pre">amount</span></code>.
Calling <code class="xref py py-meth docutils literal notranslate"><span class="pre">add()</span></code> with a key that has already
been set in the store by <code class="xref py py-meth docutils literal notranslate"><span class="pre">set()</span></code> will result
in an exception.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>key</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The key in the store whose counter will be incremented.</p></li>
<li><p><strong>amount</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – The quantity by which the counter will be incremented.</p></li>
</ul>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Using TCPStore as an example, other store types can also be used</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="mi">6</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Should return b&quot;7&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.compare_set">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">compare_set</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg1</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg2</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#bytes" title="(in Python v3.13)"><span class="pre">bytes</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.compare_set" title="Permalink to this definition">¶</a></dt>
<dd><p>Inserts the key-value pair into the store based on the supplied <code class="docutils literal notranslate"><span class="pre">key</span></code>, after
comparing <code class="docutils literal notranslate"><span class="pre">expected_value</span></code> with the value currently stored for <code class="docutils literal notranslate"><span class="pre">key</span></code>. <code class="docutils literal notranslate"><span class="pre">desired_value</span></code>
will only be set if the value currently associated with <code class="docutils literal notranslate"><span class="pre">key</span></code> in the store equals <code class="docutils literal notranslate"><span class="pre">expected_value</span></code>, or if <code class="docutils literal notranslate"><span class="pre">expected_value</span></code>
is an empty string.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>key</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The key to be checked in the store.</p></li>
<li><p><strong>expected_value</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The value associated with <code class="docutils literal notranslate"><span class="pre">key</span></code> to be checked before insertion.</p></li>
<li><p><strong>desired_value</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The value associated with <code class="docutils literal notranslate"><span class="pre">key</span></code> to be added to the store.</p></li>
</ul>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">compare_set</span><span class="p">(</span><span class="s2">&quot;key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">,</span> <span class="s2">&quot;second_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Should return b&quot;second_value&quot;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.wait">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">wait</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#torch.distributed.Store.wait" title="Permalink to this definition">¶</a></dt>
<dd><p>Overloaded function.</p>
<ol class="arabic simple">
<li><p>wait(self: torch._C._distributed_c10d.Store, arg0: list[str]) -&gt; None</p></li>
</ol>
<p>Waits for each key in <code class="docutils literal notranslate"><span class="pre">keys</span></code> to be added to the store. If not all keys are
set before the <code class="docutils literal notranslate"><span class="pre">timeout</span></code> (set during store initialization), then <code class="docutils literal notranslate"><span class="pre">wait</span></code>
will throw an exception.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>keys</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a>) – List of keys on which to wait until they are set in the store.</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Using TCPStore as an example, other store types can also be used</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># This will throw an exception after 30 seconds</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">wait</span><span class="p">([</span><span class="s2">&quot;bad_key&quot;</span><span class="p">])</span>
</pre></div>
</div>
</dd>
</dl>
<ol class="arabic simple" start="2">
<li><p>wait(self: torch._C._distributed_c10d.Store, arg0: list[str], arg1: datetime.timedelta) -&gt; None</p></li>
</ol>
<p>Waits for each key in <code class="docutils literal notranslate"><span class="pre">keys</span></code> to be added to the store, and throws an exception
if the keys have not been set by the supplied <code class="docutils literal notranslate"><span class="pre">timeout</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>keys</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a>) – List of keys on which to wait until they are set in the store.</p></li>
<li><p><strong>timeout</strong> (<em>timedelta</em>) – Time to wait for the keys to be added before throwing an exception.</p></li>
</ul>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Using TCPStore as an example, other store types can also be used</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># This will throw an exception after 10 seconds</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">wait</span><span class="p">([</span><span class="s2">&quot;bad_key&quot;</span><span class="p">],</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">10</span><span class="p">))</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.num_keys">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">num_keys</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><span class="pre">int</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.num_keys" title="Permalink to this definition">¶</a></dt>
<dd><p>Returns the number of keys set in the store. Note that this number will typically
be one greater than the number of keys added by <code class="xref py py-meth docutils literal notranslate"><span class="pre">set()</span></code>
and <code class="xref py py-meth docutils literal notranslate"><span class="pre">add()</span></code> since one key is used to coordinate all
the workers using the store.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>When used with the <a class="reference internal" href="#torch.distributed.FileStore" title="torch.distributed.FileStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileStore</span></code></a>, <code class="docutils literal notranslate"><span class="pre">num_keys</span></code> returns the number of keys written to the underlying file. If the store is destructed and another store is created with the same file, the original keys will be retained.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Returns</dt>
<dd class="field-odd"><p>The number of keys present in the store.</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Using TCPStore as an example, other store types can also be used</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># This should return 2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">num_keys</span><span class="p">()</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.delete_key">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">delete_key</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><span class="pre">str</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><span class="pre">bool</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.delete_key" title="Permalink to this definition">¶</a></dt>
<dd><p>Deletes the key-value pair associated with <code class="docutils literal notranslate"><span class="pre">key</span></code> from the store. Returns
<cite>true</cite> if the key was successfully deleted, and <cite>false</cite> if it was not.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The <code class="docutils literal notranslate"><span class="pre">delete_key</span></code> API is only supported by the <a class="reference internal" href="#torch.distributed.TCPStore" title="torch.distributed.TCPStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">TCPStore</span></code></a> and <a class="reference internal" href="#torch.distributed.HashStore" title="torch.distributed.HashStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">HashStore</span></code></a>. Using this API
with the <a class="reference internal" href="#torch.distributed.FileStore" title="torch.distributed.FileStore"><code class="xref py py-class docutils literal notranslate"><span class="pre">FileStore</span></code></a> will result in an exception.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>key</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The key to be deleted from the store</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><cite>True</cite> if <code class="docutils literal notranslate"><span class="pre">key</span></code> was deleted, otherwise <cite>False</cite>.</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Using TCPStore as an example, HashStore can also be used</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">,</span> <span class="s2">&quot;first_value&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># This should return true</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">delete_key</span><span class="p">(</span><span class="s2">&quot;first_key&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># This should return false</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">delete_key</span><span class="p">(</span><span class="s2">&quot;bad_key&quot;</span><span class="p">)</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.Store.set_timeout">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.Store.</span></span><span class="sig-name descname"><span class="pre">set_timeout</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">torch._C._distributed_c10d.Store</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">arg0</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.timedelta" title="(in Python v3.13)"><span class="pre">datetime.timedelta</span></a></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference external" href="https://docs.python.org/3/library/constants.html#None" title="(in Python v3.13)"><span class="pre">None</span></a></span></span><a class="headerlink" href="#torch.distributed.Store.set_timeout" title="Permalink to this definition">¶</a></dt>
<dd><p>Sets the store’s default timeout. This timeout is used during initialization and in
<code class="xref py py-meth docutils literal notranslate"><span class="pre">wait()</span></code> and <code class="xref py py-meth docutils literal notranslate"><span class="pre">get()</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>timeout</strong> (<em>timedelta</em>) – timeout to be set in the store.</p>
</dd>
</dl>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Using TCPStore as an example, other store types can also be used</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">TCPStore</span><span class="p">(</span><span class="s2">&quot;127.0.0.1&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">30</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">set_timeout</span><span class="p">(</span><span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">10</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># This will throw an exception after 10 seconds</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">store</span><span class="o">.</span><span class="n">wait</span><span class="p">([</span><span class="s2">&quot;bad_key&quot;</span><span class="p">])</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

</div>
<div class="section" id="groups">
<h2>Groups<a class="headerlink" href="#groups" title="Permalink to this heading">¶</a></h2>
<p>By default collectives operate on the default group (also called the world) and
require all processes to enter the distributed function call. However, some workloads can benefit
from more fine-grained communication. This is where distributed groups come
into play. The <a class="reference internal" href="#torch.distributed.new_group" title="torch.distributed.new_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">new_group()</span></code></a> function can be
used to create new groups, with arbitrary subsets of all processes. It returns
an opaque group handle that can be given as a <code class="docutils literal notranslate"><span class="pre">group</span></code> argument to all collectives
(collectives are distributed functions to exchange information in certain well-known programming patterns).</p>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.new_group">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">new_group</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ranks</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">backend</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pg_options</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">use_local_synchronization</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_desc</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#new_group"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.new_group" title="Permalink to this definition">¶</a></dt>
<dd><p>Create a new distributed group.</p>
<p>This function requires that all processes in the main group (i.e. all
processes that are part of the distributed job) enter this function, even
if they are not going to be members of the group. Additionally, groups
should be created in the same order in all processes.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Safe concurrent usage:
When using multiple process groups with the <code class="docutils literal notranslate"><span class="pre">NCCL</span></code> backend, the user
must ensure a globally consistent execution order of collectives across
ranks.</p>
<p>If multiple threads within a process issue collectives, explicit
synchronization is necessary to ensure consistent ordering.</p>
<p>When using async variants of torch.distributed communication APIs,
a work object is returned and the communication kernel is
enqueued on a separate CUDA stream, allowing overlap of communication
and computation. Once one or more async ops have been issued on one process
group, they must be synchronized with other cuda streams by calling <cite>work.wait()</cite>
before using another process group.</p>
<p>See <a class="reference external" href="https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently">Using multiple NCCL communicators concurrently</a> for more details.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>ranks</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>]</em>) – List of ranks of group members. If <code class="docutils literal notranslate"><span class="pre">None</span></code>, will be
set to all ranks. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
<li><p><strong>timeout</strong> (<em>timedelta</em><em>, </em><em>optional</em>) – see <cite>init_process_group</cite> for details and default value.</p></li>
<li><p><strong>backend</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em> or </em><a class="reference internal" href="#torch.distributed.Backend" title="torch.distributed.Backend"><em>Backend</em></a><em>, </em><em>optional</em>) – The backend to use. Depending on
build-time configurations, valid values are <code class="docutils literal notranslate"><span class="pre">gloo</span></code> and <code class="docutils literal notranslate"><span class="pre">nccl</span></code>.
By default uses the same backend as the global group. This field
should be given as a lowercase string (e.g., <code class="docutils literal notranslate"><span class="pre">&quot;gloo&quot;</span></code>), which can
also be accessed via <a class="reference internal" href="#torch.distributed.Backend" title="torch.distributed.Backend"><code class="xref py py-class docutils literal notranslate"><span class="pre">Backend</span></code></a> attributes (e.g.,
<code class="docutils literal notranslate"><span class="pre">Backend.GLOO</span></code>). If <code class="docutils literal notranslate"><span class="pre">None</span></code> is passed in, the backend
corresponding to the default process group will be used. Default is
<code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
<li><p><strong>pg_options</strong> (<em>ProcessGroupOptions</em><em>, </em><em>optional</em>) – process group options
specifying what additional options need to be passed in during
the construction of specific process groups. For example, for the <code class="docutils literal notranslate"><span class="pre">nccl</span></code>
backend, <code class="docutils literal notranslate"><span class="pre">is_high_priority_stream</span></code> can be specified so that the
process group can pick up high priority CUDA streams. For other available options to configure nccl,
see <a class="reference external" href="https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t">https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t</a></p></li>
<li><p><strong>use_local_synchronization</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – perform a group-local
barrier at the end of the process group creation. This is different
in that non-member ranks don’t need to call into API and don’t
join the barrier.</p></li>
<li><p><strong>group_desc</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a><em>, </em><em>optional</em>) – a string to describe the process group.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A handle of distributed group that can be given to collective calls or
GroupMember.NON_GROUP_MEMBER if the rank is not part of <code class="docutils literal notranslate"><span class="pre">ranks</span></code>.</p>
</dd>
</dl>
<p>N.B. use_local_synchronization doesn’t work with MPI.</p>
<p>N.B. While use_local_synchronization=True can be significantly faster with larger
clusters and small process groups, care must be taken since it changes cluster behavior
as non-member ranks don’t join the group barrier().</p>
<p>N.B. use_local_synchronization=True can lead to deadlocks when each rank creates
multiple overlapping process groups. To avoid that, make sure all ranks follow the
same global creation order.</p>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.get_group_rank">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">get_group_rank</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">global_rank</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#get_group_rank"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.get_group_rank" title="Permalink to this definition">¶</a></dt>
<dd><p>Translate a global rank into a group rank.</p>
<p><code class="docutils literal notranslate"><span class="pre">global_rank</span></code> must be part of <code class="docutils literal notranslate"><span class="pre">group</span></code> otherwise this raises RuntimeError.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>group</strong> (<em>ProcessGroup</em>) – ProcessGroup to find the relative rank.</p></li>
<li><p><strong>global_rank</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Global rank to query.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Group rank of <code class="docutils literal notranslate"><span class="pre">global_rank</span></code> relative to <code class="docutils literal notranslate"><span class="pre">group</span></code></p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a></p>
</dd>
</dl>
<p>N.B. calling this function on the default process group returns identity</p>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.get_global_rank">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">get_global_rank</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group_rank</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#get_global_rank"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.get_global_rank" title="Permalink to this definition">¶</a></dt>
<dd><p>Translate a group rank into a global rank.</p>
<p><code class="docutils literal notranslate"><span class="pre">group_rank</span></code> must be part of <cite>group</cite> otherwise this raises RuntimeError.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>group</strong> (<em>ProcessGroup</em>) – ProcessGroup to find the global rank from.</p></li>
<li><p><strong>group_rank</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Group rank to query.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Global rank of <code class="docutils literal notranslate"><span class="pre">group_rank</span></code> relative to <code class="docutils literal notranslate"><span class="pre">group</span></code></p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a></p>
</dd>
</dl>
<p>N.B. calling this function on the default process group returns identity</p>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.get_process_group_ranks">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">get_process_group_ranks</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#get_process_group_ranks"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.get_process_group_ranks" title="Permalink to this definition">¶</a></dt>
<dd><p>Get all ranks associated with <code class="docutils literal notranslate"><span class="pre">group</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>group</strong> (<em>ProcessGroup</em>) – ProcessGroup to get all ranks from.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>List of global ranks ordered by group rank.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.List" title="(in Python v3.13)"><em>List</em></a>[<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a>]</p>
</dd>
</dl>
</dd></dl>

</div>
<div class="section" id="devicemesh">
<h2>DeviceMesh<a class="headerlink" href="#devicemesh" title="Permalink to this heading">¶</a></h2>
<p>DeviceMesh is a higher level abstraction that manages process groups (or NCCL communicators).
It allows users to easily create inter-node and intra-node process groups without worrying about
how to set up the ranks correctly for different sub process groups, and it helps manage those
distributed process groups easily. The <a class="reference internal" href="#torch.distributed.device_mesh.init_device_mesh" title="torch.distributed.device_mesh.init_device_mesh"><code class="xref py py-func docutils literal notranslate"><span class="pre">init_device_mesh()</span></code></a> function can be
used to create a new DeviceMesh, with a mesh shape describing the device topology.</p>
<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.device_mesh.DeviceMesh">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.device_mesh.</span></span><span class="sig-name descname"><span class="pre">DeviceMesh</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">device_type</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mesh</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mesh_dim_names</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">_init_backend</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/device_mesh.html#DeviceMesh"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.device_mesh.DeviceMesh" title="Permalink to this definition">¶</a></dt>
<dd><p>DeviceMesh represents a mesh of devices, where the layout of devices can be
represented as an n-dimensional array, and each value of the n-dimensional
array is the global ID of a rank in the default process group.</p>
<p>DeviceMesh could be used to describe the layout of devices across the cluster,
and serves as a proxy for communication among the device lists within the cluster.</p>
<p>DeviceMesh can be used as a context manager.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>DeviceMesh follows the SPMD programming model, which means the same PyTorch Python program
is running on all processes/ranks in the cluster. Therefore, users need to make sure the
<cite>mesh</cite> array (which describes the layout of devices) is identical across all ranks.
An inconsistent <cite>mesh</cite> will lead to a silent hang.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>device_type</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.13)"><em>str</em></a>) – The device type of the mesh. Currently supports: “cpu”, “cuda/cuda-like”.</p></li>
<li><p><strong>mesh</strong> (<em>ndarray</em>) – A multi-dimensional array or an integer tensor describing the layout
of devices, where the IDs are global IDs of the default process group.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A <a class="reference internal" href="#torch.distributed.device_mesh.DeviceMesh" title="torch.distributed.device_mesh.DeviceMesh"><code class="xref py py-class docutils literal notranslate"><span class="pre">DeviceMesh</span></code></a> object representing the device layout.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference internal" href="#torch.distributed.device_mesh.DeviceMesh" title="torch.distributed.device_mesh.DeviceMesh">DeviceMesh</a></p>
</dd>
</dl>
<p>The following program runs on each process/rank in an SPMD manner. In this example, we have 2
hosts with 4 GPUs each.
A reduction over the first dimension of mesh will reduce across
columns (0, 4), (1, 5), (2, 6), and (3, 7); a reduction over the second dimension
of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).</p>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">from</span> <span class="nn">torch.distributed.device_mesh</span> <span class="kn">import</span> <span class="n">DeviceMesh</span>
<span class="go">&gt;&gt;&gt;</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Initialize device mesh as (2, 4) to represent the topology</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># of cross-host(dim 0), and within-host (dim 1).</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">mesh</span> <span class="o">=</span> <span class="n">DeviceMesh</span><span class="p">(</span><span class="n">device_type</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">mesh</span><span class="o">=</span><span class="p">[[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">],[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">7</span><span class="p">]])</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

</div>
<div class="section" id="point-to-point-communication">
<h2>Point-to-point communication<a class="headerlink" href="#point-to-point-communication" title="Permalink to this heading">¶</a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.send">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">send</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dst</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tag</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#send"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.send" title="Permalink to this definition">¶</a></dt>
<dd><p>Send a tensor synchronously.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">tag</span></code> is not supported with the NCCL backend.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to send.</p></li>
<li><p><strong>dst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Destination rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).
Destination rank should not be the same as the rank of the current process.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>tag</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Tag to match send with remote recv</p></li>
</ul>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.recv">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">recv</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tag</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#recv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.recv" title="Permalink to this definition">¶</a></dt>
<dd><p>Receives a tensor synchronously.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">tag</span></code> is not supported with the NCCL backend.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to fill with received data.</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Source rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).
Will receive from any process if unspecified.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>tag</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Tag to match recv with remote send</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Sender rank, or
-1 if not part of the group.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)">int</a></p>
</dd>
</dl>
</dd></dl>

<p><a class="reference internal" href="#torch.distributed.isend" title="torch.distributed.isend"><code class="xref py py-func docutils literal notranslate"><span class="pre">isend()</span></code></a> and <a class="reference internal" href="#torch.distributed.irecv" title="torch.distributed.irecv"><code class="xref py py-func docutils literal notranslate"><span class="pre">irecv()</span></code></a>
return distributed request objects when used. In general, the type of these objects is unspecified,
as they should never be created manually, but they are guaranteed to support two methods:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">is_completed()</span></code> - returns True if the operation has finished</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">wait()</span></code> - will block the process until the operation is finished.
<code class="docutils literal notranslate"><span class="pre">is_completed()</span></code> is guaranteed to return True once it returns.</p></li>
</ul>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.isend">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">isend</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dst</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tag</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#isend"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.isend" title="Permalink to this definition">¶</a></dt>
<dd><p>Send a tensor asynchronously.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Modifying <code class="docutils literal notranslate"><span class="pre">tensor</span></code> before the request completes causes undefined
behavior.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">tag</span></code> is not supported with the NCCL backend.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to send.</p></li>
<li><p><strong>dst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Destination rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument)</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>tag</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Tag to match send with remote recv</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A distributed request object, or
None if not part of the group.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.13)"><em>Optional</em></a>[<a class="reference internal" href="#torch.distributed.Work" title="torch.distributed.distributed_c10d.Work"><em>Work</em></a>]</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.irecv">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">irecv</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tag</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#irecv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.irecv" title="Permalink to this definition">¶</a></dt>
<dd><p>Receives a tensor asynchronously.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">tag</span></code> is not supported with the NCCL backend.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to fill with received data.</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Source rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).
Will receive from any process if unspecified.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>tag</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Tag to match recv with remote send</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A distributed request object, or
None if not part of the group.</p>
</dd>
<dt class="field-odd">Return type</dt>
<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.13)"><em>Optional</em></a>[<a class="reference internal" href="#torch.distributed.Work" title="torch.distributed.distributed_c10d.Work"><em>Work</em></a>]</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.send_object_list">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">send_object_list</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">object_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dst</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#send_object_list"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.send_object_list" title="Permalink to this definition">¶</a></dt>
<dd><p>Sends picklable objects in <code class="docutils literal notranslate"><span class="pre">object_list</span></code> synchronously.</p>
<p>Similar to <a class="reference internal" href="#torch.distributed.send" title="torch.distributed.send"><code class="xref py py-func docutils literal notranslate"><span class="pre">send()</span></code></a>, but Python objects can be passed in.
Note that all objects in <code class="docutils literal notranslate"><span class="pre">object_list</span></code> must be picklable in order to be
sent.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>object_list</strong> (<em>List</em><em>[</em><em>Any</em><em>]</em>) – List of input objects to send.
Each object must be picklable. The receiver must provide a list of equal size.</p></li>
<li><p><strong>dst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Destination rank to send <code class="docutils literal notranslate"><span class="pre">object_list</span></code> to.
Destination rank is based on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument)</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
<li><p><strong>device</strong> (<code class="docutils literal notranslate"><span class="pre">torch.device</span></code>, optional) – If not None, the objects are
serialized and converted to tensors which are moved to the
<code class="docutils literal notranslate"><span class="pre">device</span></code> before sending. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><code class="docutils literal notranslate"><span class="pre">None</span></code>.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For NCCL-based process groups, internal tensor representations
of objects must be moved to the GPU device before communication takes
place. In this case, the device used is given by
<code class="docutils literal notranslate"><span class="pre">torch.cuda.current_device()</span></code> and it is the user’s responsibility to
ensure that this is set so that each rank has an individual GPU, via
<code class="docutils literal notranslate"><span class="pre">torch.cuda.set_device()</span></code>.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><a class="reference internal" href="#torch.distributed.send_object_list" title="torch.distributed.send_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">send_object_list()</span></code></a> uses <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module implicitly, which
is known to be insecure. It is possible to construct malicious pickle
data which will execute arbitrary code during unpickling. Only call this
function with data you trust.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Calling <a class="reference internal" href="#torch.distributed.send_object_list" title="torch.distributed.send_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">send_object_list()</span></code></a> with GPU tensors is not well supported
and inefficient as it incurs GPU -&gt; CPU transfer since tensors would be
pickled. Please consider using <a class="reference internal" href="#torch.distributed.send" title="torch.distributed.send"><code class="xref py py-func docutils literal notranslate"><span class="pre">send()</span></code></a> instead.</p>
</div>
<dl>
<dt>Example::</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Assumes backend is not NCCL</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="c1"># Assumes world_size of 2.</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mi">2</span><span class="p">}]</span> <span class="c1"># any picklable object</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">send_object_list</span><span class="p">(</span><span class="n">objects</span><span class="p">,</span> <span class="n">dst</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">else</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">recv_object_list</span><span class="p">(</span><span class="n">objects</span><span class="p">,</span> <span class="n">src</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">objects</span>
<span class="go">[&#39;foo&#39;, 12, {1: 2}]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.recv_object_list">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">recv_object_list</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">object_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#recv_object_list"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.recv_object_list" title="Permalink to this definition">¶</a></dt>
<dd><p>Receives picklable objects in <code class="docutils literal notranslate"><span class="pre">object_list</span></code> synchronously.</p>
<p>Similar to <a class="reference internal" href="#torch.distributed.recv" title="torch.distributed.recv"><code class="xref py py-func docutils literal notranslate"><span class="pre">recv()</span></code></a>, but can receive Python objects.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>object_list</strong> (<em>List</em><em>[</em><em>Any</em><em>]</em>) – List of objects to receive into.
Must provide a list whose size equals the size of the list being sent.</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Source rank from which to recv <code class="docutils literal notranslate"><span class="pre">object_list</span></code>.
Source rank is based on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).
Will receive from any rank if set to None. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
<li><p><strong>device</strong> (<code class="docutils literal notranslate"><span class="pre">torch.device</span></code>, optional) – If not None, receives on this device.
Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Sender rank. -1 if rank is not part of the group. If rank is part of the group,
<code class="docutils literal notranslate"><span class="pre">object_list</span></code> will contain the sent objects from <code class="docutils literal notranslate"><span class="pre">src</span></code> rank.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For NCCL-based process groups, internal tensor representations
of objects must be moved to the GPU device before communication takes
place. In this case, the device used is given by
<code class="docutils literal notranslate"><span class="pre">torch.cuda.current_device()</span></code> and it is the user’s responsibility to
ensure that this is set so that each rank has an individual GPU, via
<code class="docutils literal notranslate"><span class="pre">torch.cuda.set_device()</span></code>.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><a class="reference internal" href="#torch.distributed.recv_object_list" title="torch.distributed.recv_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">recv_object_list()</span></code></a> uses <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module implicitly, which
is known to be insecure. It is possible to construct malicious pickle
data which will execute arbitrary code during unpickling. Only call this
function with data you trust.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Calling <a class="reference internal" href="#torch.distributed.recv_object_list" title="torch.distributed.recv_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">recv_object_list()</span></code></a> with GPU tensors is not well supported
and inefficient as it incurs GPU -&gt; CPU transfer since tensors would be
pickled. Please consider using <a class="reference internal" href="#torch.distributed.recv" title="torch.distributed.recv"><code class="xref py py-func docutils literal notranslate"><span class="pre">recv()</span></code></a> instead.</p>
</div>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Assumes backend is not NCCL</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="c1"># Assumes world_size of 2.</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mi">2</span><span class="p">}]</span> <span class="c1"># any picklable object</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">send_object_list</span><span class="p">(</span><span class="n">objects</span><span class="p">,</span> <span class="n">dst</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">else</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">recv_object_list</span><span class="p">(</span><span class="n">objects</span><span class="p">,</span> <span class="n">src</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">objects</span>
<span class="go">[&#39;foo&#39;, 12, {1: 2}]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.batch_isend_irecv">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">batch_isend_irecv</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">p2p_op_list</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#batch_isend_irecv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.batch_isend_irecv" title="Permalink to this definition">¶</a></dt>
<dd><p>Send or Receive a batch of tensors asynchronously and return a list of requests.</p>
<p>Process each of the operations in <code class="docutils literal notranslate"><span class="pre">p2p_op_list</span></code> and return the corresponding
requests. NCCL, Gloo, and UCC backends are currently supported.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>p2p_op_list</strong> – A list of point-to-point operations (the type of each operation is
<code class="docutils literal notranslate"><span class="pre">torch.distributed.P2POp</span></code>). The order of the isend/irecv in the list
matters and it needs to match the corresponding isend/irecv on the
remote end.</p>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>A list of distributed request objects returned by calling the corresponding
op in <code class="docutils literal notranslate"><span class="pre">p2p_op_list</span></code>.</p>
</dd>
</dl>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">send_tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">rank</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">recv_tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">send_op</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">P2POp</span><span class="p">(</span><span class="n">dist</span><span class="o">.</span><span class="n">isend</span><span class="p">,</span> <span class="n">send_tensor</span><span class="p">,</span> <span class="p">(</span><span class="n">rank</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">%</span><span class="n">world_size</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">recv_op</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">P2POp</span><span class="p">(</span><span class="n">dist</span><span class="o">.</span><span class="n">irecv</span><span class="p">,</span> <span class="n">recv_tensor</span><span class="p">,</span> <span class="p">(</span><span class="n">rank</span> <span class="o">-</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">world_size</span><span class="p">)</span><span class="o">%</span><span class="n">world_size</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">reqs</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">batch_isend_irecv</span><span class="p">([</span><span class="n">send_op</span><span class="p">,</span> <span class="n">recv_op</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">req</span> <span class="ow">in</span> <span class="n">reqs</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">req</span><span class="o">.</span><span class="n">wait</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">recv_tensor</span>
<span class="go">tensor([2., 3.])     # Rank 0</span>
<span class="go">tensor([0., 1.])     # Rank 1</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that when this API is used with the NCCL PG backend, users must set
the current GPU device with <code class="docutils literal notranslate"><span class="pre">torch.cuda.set_device</span></code>, otherwise it will
lead to unexpected hang issues.</p>
<p>In addition, if this API is the first collective call in the <code class="docutils literal notranslate"><span class="pre">group</span></code>
passed to <code class="docutils literal notranslate"><span class="pre">dist.P2POp</span></code>, all ranks of the <code class="docutils literal notranslate"><span class="pre">group</span></code> must participate in
this API call; otherwise, the behavior is undefined. If this API call is
not the first collective call in the <code class="docutils literal notranslate"><span class="pre">group</span></code>, batched P2P operations
involving only a subset of ranks of the <code class="docutils literal notranslate"><span class="pre">group</span></code> are allowed.</p>
</div>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.P2POp">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">P2POp</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">op</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">peer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tag</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#P2POp"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.P2POp" title="Permalink to this definition">¶</a></dt>
<dd><p>A class to build point-to-point operations for <code class="docutils literal notranslate"><span class="pre">batch_isend_irecv</span></code>.</p>
<p>This class builds the type of P2P operation, communication buffer, peer rank,
Process Group, and tag. Instances of this class will be passed to
<code class="docutils literal notranslate"><span class="pre">batch_isend_irecv</span></code> for point-to-point communications.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>op</strong> (<em>Callable</em>) – A function to send data to or receive data from a peer process.
The type of <code class="docutils literal notranslate"><span class="pre">op</span></code> is either <code class="docutils literal notranslate"><span class="pre">torch.distributed.isend</span></code> or
<code class="docutils literal notranslate"><span class="pre">torch.distributed.irecv</span></code>.</p></li>
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to send or receive.</p></li>
<li><p><strong>peer</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Destination or source rank.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>tag</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Tag to match send with recv.</p></li>
</ul>
</dd>
</dl>
</dd></dl>

</div>
<div class="section" id="synchronous-and-asynchronous-collective-operations">
<h2>Synchronous and asynchronous collective operations<a class="headerlink" href="#synchronous-and-asynchronous-collective-operations" title="Permalink to this heading">¶</a></h2>
<p>Every collective operation function supports the following two kinds of operations,
depending on the setting of the <code class="docutils literal notranslate"><span class="pre">async_op</span></code> flag passed into the collective:</p>
<p><strong>Synchronous operation</strong> - the default mode, when <code class="docutils literal notranslate"><span class="pre">async_op</span></code> is set to <code class="docutils literal notranslate"><span class="pre">False</span></code>.
When the function returns, it is guaranteed that
the collective operation is performed. In the case of CUDA operations, it is not guaranteed
that the CUDA operation is completed, since CUDA operations are asynchronous. For CPU collectives, any
further function calls utilizing the output of the collective call will behave as expected. For CUDA collectives,
function calls utilizing the output on the same CUDA stream will behave as expected. Users must take care of
synchronization under the scenario of running under different streams. For details on CUDA semantics such as stream
synchronization, see <a class="reference external" href="https://pytorch.org/docs/stable/notes/cuda.html">CUDA Semantics</a>.
See the below script to see examples of differences in these semantics for CPU and CUDA operations.</p>
<p><strong>Asynchronous operation</strong> - when <code class="docutils literal notranslate"><span class="pre">async_op</span></code> is set to <code class="docutils literal notranslate"><span class="pre">True</span></code>. The collective operation function
returns a distributed request object. In general, you don’t need to create it manually and it
is guaranteed to support the following methods:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">is_completed()</span></code> - in the case of CPU collectives, returns <code class="docutils literal notranslate"><span class="pre">True</span></code> if completed. In the case of CUDA operations,
returns <code class="docutils literal notranslate"><span class="pre">True</span></code> if the operation has been successfully enqueued onto a CUDA stream and the output can be utilized on the
default stream without further synchronization.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">wait()</span></code> - in the case of CPU collectives, will block the process until the operation is completed. In the case
of CUDA collectives, will block until the operation has been successfully enqueued onto a CUDA stream and the
output can be utilized on the default stream without further synchronization.</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">get_future()</span></code> - returns a <code class="docutils literal notranslate"><span class="pre">torch._C.Future</span></code> object. Supported for NCCL, also supported for most operations on Gloo
and MPI, except for peer-to-peer operations.
Note: as we continue adopting Futures and merging APIs, the <code class="docutils literal notranslate"><span class="pre">get_future()</span></code> call might become redundant.</p></li>
</ul>
<p><strong>Example</strong></p>
<p>The following code can serve as a reference regarding semantics for CUDA operations when using distributed collectives.
It shows the explicit need to synchronize when using collective outputs on different CUDA streams:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="c1"># Code runs on each rank.</span>
<span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="s2">&quot;nccl&quot;</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="n">rank</span><span class="p">])</span><span class="o">.</span><span class="n">cuda</span><span class="p">(</span><span class="n">rank</span><span class="p">)</span>
<span class="n">s</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">Stream</span><span class="p">()</span>
<span class="n">handle</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">async_op</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># Wait ensures the operation is enqueued, but not necessarily complete.</span>
<span class="n">handle</span><span class="o">.</span><span class="n">wait</span><span class="p">()</span>
<span class="c1"># Using result on non-default stream.</span>
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">stream</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
    <span class="n">s</span><span class="o">.</span><span class="n">wait_stream</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">default_stream</span><span class="p">())</span>
    <span class="n">output</span><span class="o">.</span><span class="n">add_</span><span class="p">(</span><span class="mi">100</span><span class="p">)</span>
<span class="k">if</span> <span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
    <span class="c1"># if the explicit call to wait_stream was omitted, the output below will be</span>
    <span class="c1"># non-deterministically 1 or 101, depending on whether the allreduce overwrote</span>
    <span class="c1"># the value after the add completed.</span>
    <span class="nb">print</span><span class="p">(</span><span class="n">output</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="collective-functions">
<h2>Collective functions<a class="headerlink" href="#collective-functions" title="Permalink to this heading">¶</a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.broadcast">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">broadcast</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#broadcast"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.broadcast" title="Permalink to this definition">¶</a></dt>
<dd><p>Broadcasts the tensor to the whole group.</p>
<p><code class="docutils literal notranslate"><span class="pre">tensor</span></code> must have the same number of elements in all processes
participating in the collective.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Data to be sent if <code class="docutils literal notranslate"><span class="pre">src</span></code> is the rank of current
process, and tensor to be used to save received data otherwise.</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Source rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.broadcast_object_list">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">broadcast_object_list</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">object_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#broadcast_object_list"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.broadcast_object_list" title="Permalink to this definition">¶</a></dt>
<dd><p>Broadcasts picklable objects in <code class="docutils literal notranslate"><span class="pre">object_list</span></code> to the whole group.</p>
<p>Similar to <a class="reference internal" href="#torch.distributed.broadcast" title="torch.distributed.broadcast"><code class="xref py py-func docutils literal notranslate"><span class="pre">broadcast()</span></code></a>, but Python objects can be passed in.
Note that all objects in <code class="docutils literal notranslate"><span class="pre">object_list</span></code> must be picklable in order to be
broadcast.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>object_list</strong> (<em>List</em><em>[</em><em>Any</em><em>]</em>) – List of input objects to broadcast.
Each object must be picklable. Only objects on the <code class="docutils literal notranslate"><span class="pre">src</span></code> rank will
be broadcast, but each rank must provide lists of equal sizes.</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Source rank from which to broadcast <code class="docutils literal notranslate"><span class="pre">object_list</span></code>.
Source rank is based on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
<li><p><strong>device</strong> (<code class="docutils literal notranslate"><span class="pre">torch.device</span></code>, optional) – If not None, the objects are
serialized and converted to tensors which are moved to the
<code class="docutils literal notranslate"><span class="pre">device</span></code> before broadcasting. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><code class="docutils literal notranslate"><span class="pre">None</span></code>. If rank is part of the group, <code class="docutils literal notranslate"><span class="pre">object_list</span></code> will contain the
broadcast objects from <code class="docutils literal notranslate"><span class="pre">src</span></code> rank.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For NCCL-based process groups, internal tensor representations
of objects must be moved to the GPU device before communication takes
place. In this case, the device used is given by
<code class="docutils literal notranslate"><span class="pre">torch.cuda.current_device()</span></code> and it is the user’s responsibility to
ensure that this is set so that each rank has an individual GPU, via
<code class="docutils literal notranslate"><span class="pre">torch.cuda.set_device()</span></code>.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that this API differs slightly from the <a class="reference internal" href="#torch.distributed.broadcast" title="torch.distributed.broadcast"><code class="xref py py-func docutils literal notranslate"><span class="pre">broadcast()</span></code></a>
collective since it does not provide an <code class="docutils literal notranslate"><span class="pre">async_op</span></code> handle and thus
will be a blocking call.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><a class="reference internal" href="#torch.distributed.broadcast_object_list" title="torch.distributed.broadcast_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">broadcast_object_list()</span></code></a> uses <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module implicitly, which
is known to be insecure. It is possible to construct malicious pickle
data which will execute arbitrary code during unpickling. Only call this
function with data you trust.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Calling <a class="reference internal" href="#torch.distributed.broadcast_object_list" title="torch.distributed.broadcast_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">broadcast_object_list()</span></code></a> with GPU tensors is not well supported
and inefficient as it incurs GPU -&gt; CPU transfer since tensors would be
pickled. Please consider using <a class="reference internal" href="#torch.distributed.broadcast" title="torch.distributed.broadcast"><code class="xref py py-func docutils literal notranslate"><span class="pre">broadcast()</span></code></a> instead.</p>
</div>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="c1"># Assumes world_size of 3.</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mi">2</span><span class="p">}]</span> <span class="c1"># any picklable object</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">else</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Assumes backend is not NCCL</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">broadcast_object_list</span><span class="p">(</span><span class="n">objects</span><span class="p">,</span> <span class="n">src</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">objects</span>
<span class="go">[&#39;foo&#39;, 12, {1: 2}]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.all_reduce">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">all_reduce</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">op=&lt;RedOpType.SUM:</span> <span class="pre">0&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op=False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#all_reduce"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.all_reduce" title="Permalink to this definition">¶</a></dt>
<dd><p>Reduces the tensor data across all machines in a way that all get the final result.</p>
<p>After the call <code class="docutils literal notranslate"><span class="pre">tensor</span></code> is going to be bitwise identical in all processes.</p>
<p>Complex tensors are supported.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Input and output of the collective. The function
operates in-place.</p></li>
<li><p><strong>op</strong> (<em>optional</em>) – One of the values from
<code class="docutils literal notranslate"><span class="pre">torch.distributed.ReduceOp</span></code>
enum.  Specifies an operation used for element-wise reductions.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group</p>
</dd>
</dl>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># All tensors below are of torch.int64 type.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># We have 2 process groups, 2 ranks.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;cuda:</span><span class="si">{</span><span class="n">rank</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">rank</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span>
<span class="go">tensor([1, 2], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([3, 4], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">tensor</span><span class="p">,</span> <span class="n">op</span><span class="o">=</span><span class="n">ReduceOp</span><span class="o">.</span><span class="n">SUM</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span>
<span class="go">tensor([4, 6], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([4, 6], device=&#39;cuda:1&#39;) # Rank 1</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># All tensors below are of torch.cfloat type.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># We have 2 process groups, 2 ranks.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">,</span> <span class="mi">2</span><span class="o">+</span><span class="mi">2</span><span class="n">j</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">rank</span> <span class="o">*</span> <span class="p">(</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span>
<span class="go">tensor([1.+1.j, 2.+2.j], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([3.+3.j, 4.+4.j], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">tensor</span><span class="p">,</span> <span class="n">op</span><span class="o">=</span><span class="n">ReduceOp</span><span class="o">.</span><span class="n">SUM</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span>
<span class="go">tensor([4.+4.j, 6.+6.j], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([4.+4.j, 6.+6.j], device=&#39;cuda:1&#39;) # Rank 1</span>
</pre></div>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.reduce">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">reduce</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dst</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">op=&lt;RedOpType.SUM:</span> <span class="pre">0&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op=False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#reduce"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.reduce" title="Permalink to this definition">¶</a></dt>
<dd><p>Reduces the tensor data across all machines.</p>
<p>Only the process with rank <code class="docutils literal notranslate"><span class="pre">dst</span></code> is going to receive the final result.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Input and output of the collective. The function
operates in-place.</p></li>
<li><p><strong>dst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Destination rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument)</p></li>
<li><p><strong>op</strong> (<em>optional</em>) – One of the values from
<code class="docutils literal notranslate"><span class="pre">torch.distributed.ReduceOp</span></code>
enum.  Specifies an operation used for element-wise reductions.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.all_gather">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">all_gather</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#all_gather"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.all_gather" title="Permalink to this definition">¶</a></dt>
<dd><p>Gathers tensors from the whole group in a list.</p>
<p>Complex and uneven sized tensors are supported.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a><em>]</em>) – Output list. It should contain
correctly-sized tensors to be used for output of the collective.
Uneven sized tensors are supported.</p></li>
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to be broadcast from current process.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group</p>
</dd>
</dl>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># All tensors below are of torch.int64 dtype.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># We have 2 process groups, 2 ranks.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;cuda:</span><span class="si">{</span><span class="n">rank</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_list</span>
<span class="go">[tensor([0, 0], device=&#39;cuda:0&#39;), tensor([0, 0], device=&#39;cuda:0&#39;)] # Rank 0</span>
<span class="go">[tensor([0, 0], device=&#39;cuda:1&#39;), tensor([0, 0], device=&#39;cuda:1&#39;)] # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">rank</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span>
<span class="go">tensor([1, 2], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([3, 4], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_gather</span><span class="p">(</span><span class="n">tensor_list</span><span class="p">,</span> <span class="n">tensor</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_list</span>
<span class="go">[tensor([1, 2], device=&#39;cuda:0&#39;), tensor([3, 4], device=&#39;cuda:0&#39;)] # Rank 0</span>
<span class="go">[tensor([1, 2], device=&#39;cuda:1&#39;), tensor([3, 4], device=&#39;cuda:1&#39;)] # Rank 1</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># All tensors below are of torch.cfloat dtype.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># We have 2 process groups, 2 ranks.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">)]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_list</span>
<span class="go">[tensor([0.+0.j, 0.+0.j], device=&#39;cuda:0&#39;), tensor([0.+0.j, 0.+0.j], device=&#39;cuda:0&#39;)] # Rank 0</span>
<span class="go">[tensor([0.+0.j, 0.+0.j], device=&#39;cuda:1&#39;), tensor([0.+0.j, 0.+0.j], device=&#39;cuda:1&#39;)] # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">,</span> <span class="mi">2</span><span class="o">+</span><span class="mi">2</span><span class="n">j</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">rank</span> <span class="o">*</span> <span class="p">(</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor</span>
<span class="go">tensor([1.+1.j, 2.+2.j], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([3.+3.j, 4.+4.j], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_gather</span><span class="p">(</span><span class="n">tensor_list</span><span class="p">,</span> <span class="n">tensor</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_list</span>
<span class="go">[tensor([1.+1.j, 2.+2.j], device=&#39;cuda:0&#39;), tensor([3.+3.j, 4.+4.j], device=&#39;cuda:0&#39;)] # Rank 0</span>
<span class="go">[tensor([1.+1.j, 2.+2.j], device=&#39;cuda:1&#39;), tensor([3.+3.j, 4.+4.j], device=&#39;cuda:1&#39;)] # Rank 1</span>
</pre></div>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.all_gather_into_tensor">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">all_gather_into_tensor</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">output_tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#all_gather_into_tensor"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.all_gather_into_tensor" title="Permalink to this definition">¶</a></dt>
<dd><p>Gather tensors from all ranks and put them in a single output tensor.</p>
<p>This function requires all tensors to be the same size on each process.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>output_tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Output tensor to accommodate tensor elements
from all ranks. It must be correctly sized to have one of the
following forms:
(i) a concatenation of all the input tensors along the primary
dimension; for definition of “concatenation”, see <code class="docutils literal notranslate"><span class="pre">torch.cat()</span></code>;
(ii) a stack of all the input tensors along the primary dimension;
for definition of “stack”, see <code class="docutils literal notranslate"><span class="pre">torch.stack()</span></code>.
Examples below may better explain the supported output forms.</p></li>
<li><p><strong>input_tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Tensor to be gathered from current rank.
Different from the <code class="docutils literal notranslate"><span class="pre">all_gather</span></code> API, the input tensors in this
API must have the same size across all ranks.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group</p>
</dd>
</dl>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># All tensors below are of torch.int64 dtype and on CUDA devices.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># We have two ranks.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;cuda:</span><span class="si">{</span><span class="n">rank</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_in</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">rank</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_in</span>
<span class="go">tensor([1, 2], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([3, 4], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Output in concatenation form</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">world_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_gather_into_tensor</span><span class="p">(</span><span class="n">tensor_out</span><span class="p">,</span> <span class="n">tensor_in</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out</span>
<span class="go">tensor([1, 2, 3, 4], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([1, 2, 3, 4], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Output in stack form</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">world_size</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_gather_into_tensor</span><span class="p">(</span><span class="n">tensor_out2</span><span class="p">,</span> <span class="n">tensor_in</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out2</span>
<span class="go">tensor([[1, 2],</span>
<span class="go">        [3, 4]], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([[1, 2],</span>
<span class="go">        [3, 4]], device=&#39;cuda:1&#39;) # Rank 1</span>
</pre></div>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The Gloo backend does not support this API.</p>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.all_gather_object">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">all_gather_object</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">object_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">obj</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#all_gather_object"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.all_gather_object" title="Permalink to this definition">¶</a></dt>
<dd><p>Gathers picklable objects from the whole group into a list.</p>
<p>Similar to <a class="reference internal" href="#torch.distributed.all_gather" title="torch.distributed.all_gather"><code class="xref py py-func docutils literal notranslate"><span class="pre">all_gather()</span></code></a>, but Python objects can be passed in.
Note that the object must be picklable in order to be gathered.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>object_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><em>Any</em><em>]</em>) – Output list. It should be correctly sized as the
size of the group for this collective and will contain the output.</p></li>
<li><p><strong>obj</strong> (<em>Any</em>) – Picklable Python object to be broadcast from current process.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None. If the calling rank is part of this group, the output of the
collective will be populated into the input <code class="docutils literal notranslate"><span class="pre">object_list</span></code>. If the
calling rank is not part of the group, the passed in <code class="docutils literal notranslate"><span class="pre">object_list</span></code> will
be unmodified.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that this API differs slightly from the <a class="reference internal" href="#torch.distributed.all_gather" title="torch.distributed.all_gather"><code class="xref py py-func docutils literal notranslate"><span class="pre">all_gather()</span></code></a>
collective since it does not provide an <code class="docutils literal notranslate"><span class="pre">async_op</span></code> handle and thus
will be a blocking call.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For NCCL-based process groups, internal tensor representations
of objects must be moved to the GPU device before communication takes
place. In this case, the device used is given by
<code class="docutils literal notranslate"><span class="pre">torch.cuda.current_device()</span></code> and it is the user’s responsibility to
ensure that this is set so that each rank has an individual GPU, via
<code class="docutils literal notranslate"><span class="pre">torch.cuda.set_device()</span></code>.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><a class="reference internal" href="#torch.distributed.all_gather_object" title="torch.distributed.all_gather_object"><code class="xref py py-func docutils literal notranslate"><span class="pre">all_gather_object()</span></code></a> uses <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module implicitly, which is
known to be insecure. It is possible to construct malicious pickle data
which will execute arbitrary code during unpickling. Only call this
function with data you trust.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Calling <a class="reference internal" href="#torch.distributed.all_gather_object" title="torch.distributed.all_gather_object"><code class="xref py py-func docutils literal notranslate"><span class="pre">all_gather_object()</span></code></a> with GPU tensors is not well supported
and inefficient as it incurs GPU -&gt; CPU transfer since tensors would be
pickled. Please consider using <a class="reference internal" href="#torch.distributed.all_gather" title="torch.distributed.all_gather"><code class="xref py py-func docutils literal notranslate"><span class="pre">all_gather()</span></code></a> instead.</p>
</div>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Assumes world_size of 3.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gather_objects</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mi">2</span><span class="p">}]</span> <span class="c1"># any picklable object</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">gather_objects</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_gather_object</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">gather_objects</span><span class="p">[</span><span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">[&#39;foo&#39;, 12, {1: 2}]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.gather">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">gather</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">gather_list</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dst</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#gather"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.gather" title="Permalink to this definition">¶</a></dt>
<dd><p>Gathers a list of tensors in a single process.</p>
<p>This function requires all tensors to be the same size on each process.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Input tensor.</p></li>
<li><p><strong>gather_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a><em>]</em><em>, </em><em>optional</em>) – List of appropriately sized tensors,
all of the same size, to use for gathered data
(default is None, must be specified on the destination rank)</p></li>
<li><p><strong>dst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Destination rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument). (default is 0)</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group.</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.gather_object">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">gather_object</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">obj</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">object_gather_list</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dst</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#gather_object"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.gather_object" title="Permalink to this definition">¶</a></dt>
<dd><p>Gathers picklable objects from the whole group in a single process.</p>
<p>Similar to <a class="reference internal" href="#torch.distributed.gather" title="torch.distributed.gather"><code class="xref py py-func docutils literal notranslate"><span class="pre">gather()</span></code></a>, but Python objects can be passed in. Note that the
object must be picklable in order to be gathered.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>obj</strong> (<em>Any</em>) – Input object. Must be picklable.</p></li>
<li><p><strong>object_gather_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><em>Any</em><em>]</em>) – Output list. On the <code class="docutils literal notranslate"><span class="pre">dst</span></code> rank, it
should be correctly sized as the size of the group for this
collective and will contain the output. Must be <code class="docutils literal notranslate"><span class="pre">None</span></code> on non-dst
ranks. (default is <code class="docutils literal notranslate"><span class="pre">None</span></code>)</p></li>
<li><p><strong>dst</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>, </em><em>optional</em>) – Destination rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument). (default is 0)</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>None. On the <code class="docutils literal notranslate"><span class="pre">dst</span></code> rank, <code class="docutils literal notranslate"><span class="pre">object_gather_list</span></code> will contain the
output of the collective.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that this API differs slightly from the gather collective
since it does not provide an async_op handle and thus will be a blocking
call.</p>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For NCCL-based process groups, internal tensor representations
of objects must be moved to the GPU device before communication takes
place. In this case, the device used is given by
<code class="docutils literal notranslate"><span class="pre">torch.cuda.current_device()</span></code> and it is the user’s responsibility to
ensure that this is set so that each rank has an individual GPU, via
<code class="docutils literal notranslate"><span class="pre">torch.cuda.set_device()</span></code>.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><a class="reference internal" href="#torch.distributed.gather_object" title="torch.distributed.gather_object"><code class="xref py py-func docutils literal notranslate"><span class="pre">gather_object()</span></code></a> uses <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module implicitly, which is
known to be insecure. It is possible to construct malicious pickle data
which will execute arbitrary code during unpickling. Only call this
function with data you trust.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Calling <a class="reference internal" href="#torch.distributed.gather_object" title="torch.distributed.gather_object"><code class="xref py py-func docutils literal notranslate"><span class="pre">gather_object()</span></code></a> with GPU tensors is not well supported
and inefficient as it incurs GPU -&gt; CPU transfer since tensors would be
pickled. Please consider using <a class="reference internal" href="#torch.distributed.gather" title="torch.distributed.gather"><code class="xref py py-func docutils literal notranslate"><span class="pre">gather()</span></code></a> instead.</p>
</div>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Assumes world_size of 3.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gather_objects</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mi">2</span><span class="p">}]</span> <span class="c1"># any picklable object</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="n">gather_objects</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">gather_object</span><span class="p">(</span>
<span class="gp">... </span>    <span class="n">gather_objects</span><span class="p">[</span><span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()],</span>
<span class="gp">... </span>    <span class="n">output</span> <span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="gp">... </span>    <span class="n">dst</span><span class="o">=</span><span class="mi">0</span>
<span class="gp">... </span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># On rank 0</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">[&#39;foo&#39;, 12, {1: 2}]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.scatter">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">scatter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">tensor</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">scatter_list</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#scatter"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.scatter" title="Permalink to this definition">¶</a></dt>
<dd><p>Scatters a list of tensors to all processes in a group.</p>
<p>Each process will receive exactly one tensor and store its data in the
<code class="docutils literal notranslate"><span class="pre">tensor</span></code> argument.</p>
<p>Complex tensors are supported.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>tensor</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Output tensor.</p></li>
<li><p><strong>scatter_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a><em>]</em>) – List of tensors to scatter (default is
None, must be specified on the source rank)</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Source rank on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).
Default is 0.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that all Tensors in scatter_list must have the same size.</p>
</div>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_size</span> <span class="o">=</span> <span class="mi">2</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">t_ones</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">tensor_size</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">t_fives</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">tensor_size</span><span class="p">)</span> <span class="o">*</span> <span class="mi">5</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output_tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">tensor_size</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="gp">... </span>    <span class="c1"># Assumes world_size of 2.</span>
<span class="gp">... </span>    <span class="c1"># Only tensors, all of which must be the same size.</span>
<span class="gp">... </span>    <span class="n">scatter_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">t_ones</span><span class="p">,</span> <span class="n">t_fives</span><span class="p">]</span>
<span class="gp">... </span><span class="k">else</span><span class="p">:</span>
<span class="gp">... </span>    <span class="n">scatter_list</span> <span class="o">=</span> <span class="kc">None</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">scatter</span><span class="p">(</span><span class="n">output_tensor</span><span class="p">,</span> <span class="n">scatter_list</span><span class="p">,</span> <span class="n">src</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Rank i gets scatter_list[i]. For example, on rank 1:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output_tensor</span>
<span class="go">tensor([5., 5.])</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.scatter_object_list">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">scatter_object_list</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">scatter_object_output_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">scatter_object_input_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">src</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#scatter_object_list"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.scatter_object_list" title="Permalink to this definition">¶</a></dt>
<dd><p>Scatters picklable objects in <code class="docutils literal notranslate"><span class="pre">scatter_object_input_list</span></code> to the whole group.</p>
<p>Similar to <a class="reference internal" href="#torch.distributed.scatter" title="torch.distributed.scatter"><code class="xref py py-func docutils literal notranslate"><span class="pre">scatter()</span></code></a>, but Python objects can be passed in. On
each rank, the scattered object will be stored as the first element of
<code class="docutils literal notranslate"><span class="pre">scatter_object_output_list</span></code>. Note that all objects in
<code class="docutils literal notranslate"><span class="pre">scatter_object_input_list</span></code> must be picklable in order to be scattered.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>scatter_object_output_list</strong> (<em>List</em><em>[</em><em>Any</em><em>]</em>) – Non-empty list whose first
element will store the object scattered to this rank.</p></li>
<li><p><strong>scatter_object_input_list</strong> (<em>List</em><em>[</em><em>Any</em><em>]</em>) – List of input objects to scatter.
Each object must be picklable. Only objects on the <code class="docutils literal notranslate"><span class="pre">src</span></code> rank will
be scattered, and the argument can be <code class="docutils literal notranslate"><span class="pre">None</span></code> for non-src ranks.</p></li>
<li><p><strong>src</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Source rank from which to scatter <code class="docutils literal notranslate"><span class="pre">scatter_object_input_list</span></code>.
Source rank is based on global process group (regardless of <code class="docutils literal notranslate"><span class="pre">group</span></code> argument).</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used. Default is <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><code class="docutils literal notranslate"><span class="pre">None</span></code>. If rank is part of the group, <code class="docutils literal notranslate"><span class="pre">scatter_object_output_list</span></code>
will have its first element set to the scattered object for this rank.</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that this API differs slightly from the scatter collective
since it does not provide an <code class="docutils literal notranslate"><span class="pre">async_op</span></code> handle and thus will be a
blocking call.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><a class="reference internal" href="#torch.distributed.scatter_object_list" title="torch.distributed.scatter_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">scatter_object_list()</span></code></a> uses <code class="docutils literal notranslate"><span class="pre">pickle</span></code> module implicitly, which
is known to be insecure. It is possible to construct malicious pickle
data which will execute arbitrary code during unpickling. Only call this
function with data you trust.</p>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>Calling <a class="reference internal" href="#torch.distributed.scatter_object_list" title="torch.distributed.scatter_object_list"><code class="xref py py-func docutils literal notranslate"><span class="pre">scatter_object_list()</span></code></a> with GPU tensors is not well supported
and inefficient as it incurs GPU -&gt; CPU transfer since tensors would be
pickled. Please consider using <a class="reference internal" href="#torch.distributed.scatter" title="torch.distributed.scatter"><code class="xref py py-func docutils literal notranslate"><span class="pre">scatter()</span></code></a> instead.</p>
</div>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="gp">... </span>    <span class="c1"># Assumes world_size of 3.</span>
<span class="gp">... </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;foo&quot;</span><span class="p">,</span> <span class="mi">12</span><span class="p">,</span> <span class="p">{</span><span class="mi">1</span><span class="p">:</span> <span class="mi">2</span><span class="p">}]</span> <span class="c1"># any picklable object</span>
<span class="gp">... </span><span class="k">else</span><span class="p">:</span>
<span class="gp">... </span>    <span class="c1"># Can be any list on non-src ranks, elements are not used.</span>
<span class="gp">... </span>    <span class="n">objects</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output_list</span> <span class="o">=</span> <span class="p">[</span><span class="kc">None</span><span class="p">]</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">scatter_object_list</span><span class="p">(</span><span class="n">output_list</span><span class="p">,</span> <span class="n">objects</span><span class="p">,</span> <span class="n">src</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Rank i gets objects[i]. For example, on rank 2:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output_list</span>
<span class="go">[{1: 2}]</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.reduce_scatter">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">reduce_scatter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">output</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">op=&lt;RedOpType.SUM:</span> <span class="pre">0&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op=False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#reduce_scatter"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.reduce_scatter" title="Permalink to this definition">¶</a></dt>
<dd><p>Reduces, then scatters a list of tensors to all processes in a group.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>output</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Output tensor.</p></li>
<li><p><strong>input_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a><em>]</em>) – List of tensors to reduce and scatter.</p></li>
<li><p><strong>op</strong> (<em>optional</em>) – One of the values from
<code class="docutils literal notranslate"><span class="pre">torch.distributed.ReduceOp</span></code>
enum.  Specifies an operation used for element-wise reductions.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group.</p>
</dd>
</dl>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.reduce_scatter_tensor">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">reduce_scatter_tensor</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">output</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">op=&lt;RedOpType.SUM:</span> <span class="pre">0&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op=False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#reduce_scatter_tensor"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.reduce_scatter_tensor" title="Permalink to this definition">¶</a></dt>
<dd><p>Reduces, then scatters a tensor to all ranks in a group.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>output</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Output tensor. It should have the same size across all
ranks.</p></li>
<li><p><strong>input</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Input tensor to be reduced and scattered. Its size
should be output tensor size times the world size. The input tensor
can have one of the following shapes:
(i) a concatenation of the output tensors along the primary
dimension, or
(ii) a stack of the output tensors along the primary dimension.
For definition of “concatenation”, see <code class="docutils literal notranslate"><span class="pre">torch.cat()</span></code>.
For definition of “stack”, see <code class="docutils literal notranslate"><span class="pre">torch.stack()</span></code>.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group.</p>
</dd>
</dl>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># All tensors below are of torch.int64 dtype and on CUDA devices.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># We have two ranks.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="sa">f</span><span class="s1">&#39;cuda:</span><span class="si">{</span><span class="n">rank</span><span class="si">}</span><span class="s1">&#39;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Input in concatenation form</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_in</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="n">world_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">device</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_in</span>
<span class="go">tensor([0, 1, 2, 3], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([0, 1, 2, 3], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">reduce_scatter_tensor</span><span class="p">(</span><span class="n">tensor_out</span><span class="p">,</span> <span class="n">tensor_in</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out</span>
<span class="go">tensor([0, 2], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([4, 6], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Input in stack form</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_in</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">tensor_in</span><span class="p">,</span> <span class="p">(</span><span class="n">world_size</span><span class="p">,</span> <span class="mi">2</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_in</span>
<span class="go">tensor([[0, 1],</span>
<span class="go">        [2, 3]], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([[0, 1],</span>
<span class="go">        [2, 3]], device=&#39;cuda:1&#39;) # Rank 1</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">reduce_scatter_tensor</span><span class="p">(</span><span class="n">tensor_out</span><span class="p">,</span> <span class="n">tensor_in</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">tensor_out</span>
<span class="go">tensor([0, 2], device=&#39;cuda:0&#39;) # Rank 0</span>
<span class="go">tensor([4, 6], device=&#39;cuda:1&#39;) # Rank 1</span>
</pre></div>
</div>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The Gloo backend does not support this API.</p>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.all_to_all_single">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">all_to_all_single</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">output</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_split_sizes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_split_sizes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#all_to_all_single"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.all_to_all_single" title="Permalink to this definition">¶</a></dt>
<dd><p>Split input tensor and then scatter the split list to all processes in a group.</p>
<p>Later the received tensors are concatenated from all the processes in the group
and returned as a single output tensor.</p>
<p>Complex tensors are supported.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>output</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Gathered concatenated output tensor.</p></li>
<li><p><strong>input</strong> (<a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a>) – Input tensor to scatter.</p></li>
<li><p><strong>output_split_sizes</strong> – (list[int], optional): Output split sizes for dim 0.
If None or empty, dim 0 of the <code class="docutils literal notranslate"><span class="pre">output</span></code> tensor must be evenly
divisible by <code class="docutils literal notranslate"><span class="pre">world_size</span></code>.</p></li>
<li><p><strong>input_split_sizes</strong> – (list[int], optional): Input split sizes for dim 0.
If None or empty, dim 0 of the <code class="docutils literal notranslate"><span class="pre">input</span></code> tensor must be evenly
divisible by <code class="docutils literal notranslate"><span class="pre">world_size</span></code>.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group.</p>
</dd>
</dl>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><cite>all_to_all_single</cite> is experimental and subject to change.</p>
</div>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">4</span><span class="p">)</span> <span class="o">+</span> <span class="n">rank</span> <span class="o">*</span> <span class="mi">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">tensor([0, 1, 2, 3])     # Rank 0</span>
<span class="go">tensor([4, 5, 6, 7])     # Rank 1</span>
<span class="go">tensor([8, 9, 10, 11])   # Rank 2</span>
<span class="go">tensor([12, 13, 14, 15]) # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">([</span><span class="mi">4</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_to_all_single</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">tensor([0, 4, 8, 12])    # Rank 0</span>
<span class="go">tensor([1, 5, 9, 13])    # Rank 1</span>
<span class="go">tensor([2, 6, 10, 14])   # Rank 2</span>
<span class="go">tensor([3, 7, 11, 15])   # Rank 3</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Essentially, it is similar to the following operation:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">scatter_list</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">input</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="n">world_size</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gather_list</span>  <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">output</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="n">world_size</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">world_size</span><span class="p">):</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">scatter</span><span class="p">(</span><span class="n">gather_list</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">scatter_list</span> <span class="k">if</span> <span class="n">i</span> <span class="o">==</span> <span class="n">rank</span> <span class="k">else</span> <span class="p">[],</span> <span class="n">src</span> <span class="o">=</span> <span class="n">i</span><span class="p">)</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Another example with uneven split</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">tensor([0, 1, 2, 3, 4, 5])                                       # Rank 0</span>
<span class="go">tensor([10, 11, 12, 13, 14, 15, 16, 17, 18])                     # Rank 1</span>
<span class="go">tensor([20, 21, 22, 23, 24])                                     # Rank 2</span>
<span class="go">tensor([30, 31, 32, 33, 34, 35, 36])                             # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">input_splits</span>
<span class="go">[2, 2, 1, 1]                                                     # Rank 0</span>
<span class="go">[3, 2, 2, 2]                                                     # Rank 1</span>
<span class="go">[2, 1, 1, 1]                                                     # Rank 2</span>
<span class="go">[2, 2, 2, 1]                                                     # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output_splits</span>
<span class="go">[2, 3, 2, 2]                                                     # Rank 0</span>
<span class="go">[2, 2, 1, 2]                                                     # Rank 1</span>
<span class="go">[1, 2, 1, 2]                                                     # Rank 2</span>
<span class="go">[1, 2, 1, 1]                                                     # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="o">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_to_all_single</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">output_splits</span><span class="p">,</span> <span class="n">input_splits</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">tensor([ 0,  1, 10, 11, 12, 20, 21, 30, 31])                     # Rank 0</span>
<span class="go">tensor([ 2,  3, 13, 14, 22, 32, 33])                             # Rank 1</span>
<span class="go">tensor([ 4, 15, 16, 23, 34, 35])                                 # Rank 2</span>
<span class="go">tensor([ 5, 17, 18, 24, 36])                                     # Rank 3</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Another example with tensors of torch.cfloat type.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">,</span> <span class="mi">2</span><span class="o">+</span><span class="mi">2</span><span class="n">j</span><span class="p">,</span> <span class="mi">3</span><span class="o">+</span><span class="mi">3</span><span class="n">j</span><span class="p">,</span> <span class="mi">4</span><span class="o">+</span><span class="mi">4</span><span class="n">j</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">)</span> <span class="o">+</span> <span class="mi">4</span> <span class="o">*</span> <span class="n">rank</span> <span class="o">*</span> <span class="p">(</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">tensor([1+1j, 2+2j, 3+3j, 4+4j])                                # Rank 0</span>
<span class="go">tensor([5+5j, 6+6j, 7+7j, 8+8j])                                # Rank 1</span>
<span class="go">tensor([9+9j, 10+10j, 11+11j, 12+12j])                          # Rank 2</span>
<span class="go">tensor([13+13j, 14+14j, 15+15j, 16+16j])                        # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">([</span><span class="mi">4</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_to_all_single</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">tensor([1+1j, 5+5j, 9+9j, 13+13j])                              # Rank 0</span>
<span class="go">tensor([2+2j, 6+6j, 10+10j, 14+14j])                            # Rank 1</span>
<span class="go">tensor([3+3j, 7+7j, 11+11j, 15+15j])                            # Rank 2</span>
<span class="go">tensor([4+4j, 8+8j, 12+12j, 16+16j])                            # Rank 3</span>
</pre></div>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.all_to_all">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">all_to_all</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">output_tensor_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">input_tensor_list</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#all_to_all"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.all_to_all" title="Permalink to this definition">¶</a></dt>
<dd><p>Scatters a list of input tensors to all processes in a group and returns the gathered list of tensors in the output list.</p>
<p>Complex tensors are supported.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>output_tensor_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a><em>]</em>) – List of tensors to be gathered one
per rank.</p></li>
<li><p><strong>input_tensor_list</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#list" title="(in Python v3.13)"><em>list</em></a><em>[</em><a class="reference internal" href="tensors.html#torch.Tensor" title="torch.Tensor"><em>Tensor</em></a><em>]</em>) – List of tensors to scatter one per rank.</p></li>
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group.</p>
</dd>
</dl>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><cite>all_to_all</cite> is experimental and subject to change.</p>
</div>
<p class="rubric">Examples</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">4</span><span class="p">)</span> <span class="o">+</span> <span class="n">rank</span> <span class="o">*</span> <span class="mi">4</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">input</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="mi">4</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">[tensor([0]), tensor([1]), tensor([2]), tensor([3])]     # Rank 0</span>
<span class="go">[tensor([4]), tensor([5]), tensor([6]), tensor([7])]     # Rank 1</span>
<span class="go">[tensor([8]), tensor([9]), tensor([10]), tensor([11])]   # Rank 2</span>
<span class="go">[tensor([12]), tensor([13]), tensor([14]), tensor([15])] # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">([</span><span class="mi">4</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">)</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="mi">4</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_to_all</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">[tensor([0]), tensor([4]), tensor([8]), tensor([12])]    # Rank 0</span>
<span class="go">[tensor([1]), tensor([5]), tensor([9]), tensor([13])]    # Rank 1</span>
<span class="go">[tensor([2]), tensor([6]), tensor([10]), tensor([14])]   # Rank 2</span>
<span class="go">[tensor([3]), tensor([7]), tensor([11]), tensor([15])]   # Rank 3</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Essentially, it is similar to the following operation:</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">scatter_list</span> <span class="o">=</span> <span class="nb">input</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">gather_list</span>  <span class="o">=</span> <span class="n">output</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">world_size</span><span class="p">):</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">scatter</span><span class="p">(</span><span class="n">gather_list</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> <span class="n">scatter_list</span> <span class="k">if</span> <span class="n">i</span> <span class="o">==</span> <span class="n">rank</span> <span class="k">else</span> <span class="p">[],</span> <span class="n">src</span><span class="o">=</span><span class="n">i</span><span class="p">)</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">tensor([0, 1, 2, 3, 4, 5])                                       # Rank 0</span>
<span class="go">tensor([10, 11, 12, 13, 14, 15, 16, 17, 18])                     # Rank 1</span>
<span class="go">tensor([20, 21, 22, 23, 24])                                     # Rank 2</span>
<span class="go">tensor([30, 31, 32, 33, 34, 35, 36])                             # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">input_splits</span>
<span class="go">[2, 2, 1, 1]                                                     # Rank 0</span>
<span class="go">[3, 2, 2, 2]                                                     # Rank 1</span>
<span class="go">[2, 1, 1, 1]                                                     # Rank 2</span>
<span class="go">[2, 2, 2, 1]                                                     # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output_splits</span>
<span class="go">[2, 3, 2, 2]                                                     # Rank 0</span>
<span class="go">[2, 2, 1, 2]                                                     # Rank 1</span>
<span class="go">[1, 2, 1, 2]                                                     # Rank 2</span>
<span class="go">[1, 2, 1, 1]                                                     # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">input</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">input_splits</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">[tensor([0, 1]), tensor([2, 3]), tensor([4]), tensor([5])]                   # Rank 0</span>
<span class="go">[tensor([10, 11, 12]), tensor([13, 14]), tensor([15, 16]), tensor([17, 18])] # Rank 1</span>
<span class="go">[tensor([20, 21]), tensor([22]), tensor([23]), tensor([24])]                 # Rank 2</span>
<span class="go">[tensor([30, 31]), tensor([32, 33]), tensor([34, 35]), tensor([36])]         # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="o">...</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_to_all</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">[tensor([0, 1]), tensor([10, 11, 12]), tensor([20, 21]), tensor([30, 31])]   # Rank 0</span>
<span class="go">[tensor([2, 3]), tensor([13, 14]), tensor([22]), tensor([32, 33])]           # Rank 1</span>
<span class="go">[tensor([4]), tensor([15, 16]), tensor([23]), tensor([34, 35])]              # Rank 2</span>
<span class="go">[tensor([5]), tensor([17, 18]), tensor([24]), tensor([36])]                  # Rank 3</span>
</pre></div>
</div>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Another example with tensors of torch.cfloat type.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">,</span> <span class="mi">2</span><span class="o">+</span><span class="mi">2</span><span class="n">j</span><span class="p">,</span> <span class="mi">3</span><span class="o">+</span><span class="mi">3</span><span class="n">j</span><span class="p">,</span> <span class="mi">4</span><span class="o">+</span><span class="mi">4</span><span class="n">j</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">)</span> <span class="o">+</span> <span class="mi">4</span> <span class="o">*</span> <span class="n">rank</span> <span class="o">*</span> <span class="p">(</span><span class="mi">1</span><span class="o">+</span><span class="mi">1</span><span class="n">j</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">input</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="mi">4</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="nb">input</span>
<span class="go">[tensor([1+1j]), tensor([2+2j]), tensor([3+3j]), tensor([4+4j])]            # Rank 0</span>
<span class="go">[tensor([5+5j]), tensor([6+6j]), tensor([7+7j]), tensor([8+8j])]            # Rank 1</span>
<span class="go">[tensor([9+9j]), tensor([10+10j]), tensor([11+11j]), tensor([12+12j])]      # Rank 2</span>
<span class="go">[tensor([13+13j]), tensor([14+14j]), tensor([15+15j]), tensor([16+16j])]    # Rank 3</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">([</span><span class="mi">4</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">cfloat</span><span class="p">)</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="mi">4</span><span class="p">))</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dist</span><span class="o">.</span><span class="n">all_to_all</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">output</span>
<span class="go">[tensor([1+1j]), tensor([5+5j]), tensor([9+9j]), tensor([13+13j])]          # Rank 0</span>
<span class="go">[tensor([2+2j]), tensor([6+6j]), tensor([10+10j]), tensor([14+14j])]        # Rank 1</span>
<span class="go">[tensor([3+3j]), tensor([7+7j]), tensor([11+11j]), tensor([15+15j])]        # Rank 2</span>
<span class="go">[tensor([4+4j]), tensor([8+8j]), tensor([12+12j]), tensor([16+16j])]        # Rank 3</span>
</pre></div>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.barrier">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">barrier</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">async_op</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device_ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#barrier"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.barrier" title="Permalink to this definition">¶</a></dt>
<dd><p>Synchronize all processes.</p>
<p>If async_op is False, this collective blocks processes until the whole group enters this function;
if async_op is True, processes block when wait() is called on the returned async work handle.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If None,
the default process group will be used.</p></li>
<li><p><strong>async_op</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether this op should be an async op</p></li>
<li><p><strong>device_ids</strong> (<em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a><em>]</em><em>, </em><em>optional</em>) – List of device/GPU ids.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p>Async work handle, if async_op is set to True.
None, if not async_op or if not part of the group</p>
</dd>
</dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p><cite>ProcessGroupNCCL</cite> now relies on stream synchronization instead of
device synchronization to block the CPU. Thus, please do not assume that
<cite>barrier()</cite> would perform a device synchronization.</p>
</div>
</dd></dl>

<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.monitored_barrier">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">monitored_barrier</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">group</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">wait_all_ranks</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed/distributed_c10d.html#monitored_barrier"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.monitored_barrier" title="Permalink to this definition">¶</a></dt>
<dd><p>Synchronize processes similar to <code class="docutils literal notranslate"><span class="pre">torch.distributed.barrier</span></code>, but consider a configurable timeout.</p>
<p>It is able to report ranks that did not pass this barrier within the provided timeout.
Specifically, non-zero ranks will block until a send/recv is processed from rank 0.
Rank 0 will block until all send/recv from other ranks are processed, and will report
failures for ranks that failed to respond in time. Note that if one rank does not reach the
monitored_barrier (for example due to a hang), all other ranks would fail in monitored_barrier.</p>
<p>This collective will block all processes/ranks in the group, until the
whole group exits the function successfully, making it useful for debugging
and synchronizing. However, it can have a performance impact and should only
be used for debugging or scenarios that require full synchronization points
on the host-side. For debugging purposes, this barrier can be inserted
before the application’s collective calls to check if any ranks are
desynchronized.</p>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>Note that this collective is only supported with the GLOO backend.</p>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>group</strong> (<em>ProcessGroup</em><em>, </em><em>optional</em>) – The process group to work on. If
<code class="docutils literal notranslate"><span class="pre">None</span></code>, the default process group will be used.</p></li>
<li><p><strong>timeout</strong> (<a class="reference external" href="https://docs.python.org/3/library/datetime.html#datetime.timedelta" title="(in Python v3.13)"><em>datetime.timedelta</em></a><em>, </em><em>optional</em>) – Timeout for monitored_barrier.
If <code class="docutils literal notranslate"><span class="pre">None</span></code>, the default process group timeout will be used.</p></li>
<li><p><strong>wait_all_ranks</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.13)"><em>bool</em></a><em>, </em><em>optional</em>) – Whether to collect all failed ranks or
not. By default, this is <code class="docutils literal notranslate"><span class="pre">False</span></code> and <code class="docutils literal notranslate"><span class="pre">monitored_barrier</span></code> on rank 0
will throw on the first failed rank it encounters in order to fail
fast. By setting <code class="docutils literal notranslate"><span class="pre">wait_all_ranks=True</span></code> <code class="docutils literal notranslate"><span class="pre">monitored_barrier</span></code> will
collect all failed ranks and throw an error containing information
about all failed ranks.</p></li>
</ul>
</dd>
<dt class="field-even">Returns</dt>
<dd class="field-even"><p><code class="docutils literal notranslate"><span class="pre">None</span></code>.</p>
</dd>
</dl>
<dl>
<dt>Example:</dt><dd><div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="c1"># Note: Process group initialization omitted on each rank.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">monitored_barrier</span><span class="p">()</span> <span class="c1"># Raises exception indicating that</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># rank 1 did not call into monitored_barrier.</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># Example with wait_all_ranks=True</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">if</span> <span class="n">dist</span><span class="o">.</span><span class="n">get_rank</span><span class="p">()</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="gp">&gt;&gt;&gt; </span>    <span class="n">dist</span><span class="o">.</span><span class="n">monitored_barrier</span><span class="p">(</span><span class="n">wait_all_ranks</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="c1"># Raises exception</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># indicating that ranks 1, 2, ... world_size - 1 did not call into</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1"># monitored_barrier.</span>
</pre></div>
</div>
</dd>
</dl>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.Work">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">Work</span></span><a class="headerlink" href="#torch.distributed.Work" title="Permalink to this definition">¶</a></dt>
<dd><p>A <cite>Work</cite> object represents the handle to a pending asynchronous operation in
PyTorch’s distributed package. It is returned by non-blocking collective operations,
such as <cite>dist.all_reduce(tensor, async_op=True)</cite>.</p>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.ReduceOp">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">ReduceOp</span></span><a class="headerlink" href="#torch.distributed.ReduceOp" title="Permalink to this definition">¶</a></dt>
<dd><p>An enum-like class for available reduction operations: <code class="docutils literal notranslate"><span class="pre">SUM</span></code>, <code class="docutils literal notranslate"><span class="pre">PRODUCT</span></code>,
<code class="docutils literal notranslate"><span class="pre">MIN</span></code>, <code class="docutils literal notranslate"><span class="pre">MAX</span></code>, <code class="docutils literal notranslate"><span class="pre">BAND</span></code>, <code class="docutils literal notranslate"><span class="pre">BOR</span></code>, <code class="docutils literal notranslate"><span class="pre">BXOR</span></code>, and <code class="docutils literal notranslate"><span class="pre">PREMUL_SUM</span></code>.</p>
<p><code class="docutils literal notranslate"><span class="pre">BAND</span></code>, <code class="docutils literal notranslate"><span class="pre">BOR</span></code>, and <code class="docutils literal notranslate"><span class="pre">BXOR</span></code> reductions are not available when
using the <code class="docutils literal notranslate"><span class="pre">NCCL</span></code> backend.</p>
<p><code class="docutils literal notranslate"><span class="pre">AVG</span></code> divides values by the world size before summing across ranks.
<code class="docutils literal notranslate"><span class="pre">AVG</span></code> is only available with the <code class="docutils literal notranslate"><span class="pre">NCCL</span></code> backend,
and only for NCCL versions 2.10 or later.</p>
<p><code class="docutils literal notranslate"><span class="pre">PREMUL_SUM</span></code> multiplies inputs by a given scalar locally before reduction.
<code class="docutils literal notranslate"><span class="pre">PREMUL_SUM</span></code> is only available with the <code class="docutils literal notranslate"><span class="pre">NCCL</span></code> backend,
and only available for NCCL versions 2.11 or later. Users are supposed to
use <code class="docutils literal notranslate"><span class="pre">torch.distributed._make_nccl_premul_sum</span></code>.</p>
<p>Additionally, <code class="docutils literal notranslate"><span class="pre">MAX</span></code>, <code class="docutils literal notranslate"><span class="pre">MIN</span></code> and <code class="docutils literal notranslate"><span class="pre">PRODUCT</span></code> are not supported for complex tensors.</p>
<p>The values of this class can be accessed as attributes, e.g., <code class="docutils literal notranslate"><span class="pre">ReduceOp.SUM</span></code>.
They are used in specifying strategies for reduction collectives, e.g.,
<a class="reference internal" href="#torch.distributed.reduce" title="torch.distributed.reduce"><code class="xref py py-func docutils literal notranslate"><span class="pre">reduce()</span></code></a>.</p>
<p>This class does not support <code class="docutils literal notranslate"><span class="pre">__members__</span></code> property.</p>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.reduce_op">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">reduce_op</span></span><a class="headerlink" href="#torch.distributed.reduce_op" title="Permalink to this definition">¶</a></dt>
<dd><p>Deprecated enum-like class for reduction operations: <code class="docutils literal notranslate"><span class="pre">SUM</span></code>, <code class="docutils literal notranslate"><span class="pre">PRODUCT</span></code>,
<code class="docutils literal notranslate"><span class="pre">MIN</span></code>, and <code class="docutils literal notranslate"><span class="pre">MAX</span></code>.</p>
<p>It is recommended to use <a class="reference internal" href="#torch.distributed.ReduceOp" title="torch.distributed.ReduceOp"><code class="xref py py-class docutils literal notranslate"><span class="pre">ReduceOp</span></code></a> instead.</p>
</dd></dl>

</div>
<div class="section" id="profiling-collective-communication">
<h2>Profiling Collective Communication<a class="headerlink" href="#profiling-collective-communication" title="Permalink to this heading">¶</a></h2>
<p>Note that you can use <code class="docutils literal notranslate"><span class="pre">torch.profiler</span></code> (recommended, only available after 1.8.1)  or <code class="docutils literal notranslate"><span class="pre">torch.autograd.profiler</span></code> to profile collective communication and point-to-point communication APIs mentioned here. All out-of-the-box backends (<code class="docutils literal notranslate"><span class="pre">gloo</span></code>,
<code class="docutils literal notranslate"><span class="pre">nccl</span></code>, <code class="docutils literal notranslate"><span class="pre">mpi</span></code>) are supported and collective communication usage will be rendered as expected in profiling output/traces. Profiling your code is the same as any regular torch operator:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">profiler</span><span class="o">.</span><span class="n">profile</span><span class="p">():</span>
    <span class="n">tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">20</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">tensor</span><span class="p">)</span>
</pre></div>
</div>
<p>Please refer to the <a class="reference external" href="https://pytorch.org/docs/main/profiler.html">profiler documentation</a> for a full overview of profiler features.</p>
</div>
<div class="section" id="multi-gpu-collective-functions">
<h2>Multi-GPU collective functions<a class="headerlink" href="#multi-gpu-collective-functions" title="Permalink to this heading">¶</a></h2>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The multi-GPU functions (which stand for multiple GPUs per CPU thread) are
deprecated. As of today, PyTorch Distributed’s preferred programming model
is one device per thread, as exemplified by the APIs in this document. If
you are a backend developer and want to support multiple devices per thread,
please contact PyTorch Distributed’s maintainers.</p>
</div>
</div>
<div class="section" id="third-party-backends">
<span id="distributed-launch"></span><h2>Third-party backends<a class="headerlink" href="#third-party-backends" title="Permalink to this heading">¶</a></h2>
<p>Besides the builtin GLOO/MPI/NCCL backends, PyTorch distributed supports
third-party backends through a run-time register mechanism.
For references on how to develop a third-party backend through C++ Extension,
please refer to <a class="reference external" href="https://pytorch.org/tutorials/advanced/cpp_extension.html">Tutorials - Custom C++ and CUDA Extensions</a> and
<code class="docutils literal notranslate"><span class="pre">test/cpp_extensions/cpp_c10d_extension.cpp</span></code>. The capabilities of third-party
backends are decided by their own implementations.</p>
<p>The new backend derives from <code class="docutils literal notranslate"><span class="pre">c10d::ProcessGroup</span></code> and registers the backend
name and the instantiating interface through <a class="reference internal" href="#torch.distributed.Backend.register_backend" title="torch.distributed.Backend.register_backend"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.Backend.register_backend()</span></code></a>
when imported.</p>
<p>When manually importing this backend and invoking <a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.init_process_group()</span></code></a>
with the corresponding backend name, the <code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> package runs on
the new backend.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>The support of third-party backend is experimental and subject to change.</p>
</div>
</div>
<div class="section" id="launch-utility">
<h2>Launch utility<a class="headerlink" href="#launch-utility" title="Permalink to this heading">¶</a></h2>
<p>The <cite>torch.distributed</cite> package also provides a launch utility in
<cite>torch.distributed.launch</cite>. This helper utility can be used to launch
multiple processes per node for distributed training.</p>
<span class="target" id="module-torch.distributed.launch"></span><p>Module <code class="docutils literal notranslate"><span class="pre">torch.distributed.launch</span></code>.</p>
<p><code class="docutils literal notranslate"><span class="pre">torch.distributed.launch</span></code> is a module that spawns up multiple distributed
training processes on each of the training nodes.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p>This module is going to be deprecated in favor of <a class="reference internal" href="elastic/run.html#launcher-api"><span class="std std-ref">torchrun</span></a>.</p>
</div>
<p>The utility can be used for single-node distributed training, in which one or
more processes per node will be spawned. The utility can be used for either
CPU training or GPU training. If the utility is used for GPU training,
each distributed process will be operating on a single GPU. This can achieve
well-improved single-node training performance. It can also be used in
multi-node distributed training, by spawning up multiple processes on each node
for well-improved multi-node distributed training performance as well.
This will especially be beneficial for systems with multiple Infiniband
interfaces that have direct-GPU support, since all of them can be utilized for
aggregated communication bandwidth.</p>
<p>In both cases of single-node distributed training or multi-node distributed
training, this utility will launch the given number of processes per node
(<code class="docutils literal notranslate"><span class="pre">--nproc-per-node</span></code>). If used for GPU training, this number needs to be less
than or equal to the number of GPUs on the current system (<code class="docutils literal notranslate"><span class="pre">nproc_per_node</span></code>),
and each process will be operating on a single GPU from <em>GPU 0 to
GPU (nproc_per_node - 1)</em>.</p>
<p><strong>How to use this module:</strong></p>
<ol class="arabic simple">
<li><p>Single-Node multi-process distributed training</p></li>
</ol>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="o">-</span><span class="n">m</span> <span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">launch</span> <span class="o">--</span><span class="n">nproc</span><span class="o">-</span><span class="n">per</span><span class="o">-</span><span class="n">node</span><span class="o">=</span><span class="n">NUM_GPUS_YOU_HAVE</span>
           <span class="n">YOUR_TRAINING_SCRIPT</span><span class="o">.</span><span class="n">py</span> <span class="p">(</span><span class="o">--</span><span class="n">arg1</span> <span class="o">--</span><span class="n">arg2</span> <span class="o">--</span><span class="n">arg3</span> <span class="ow">and</span> <span class="nb">all</span> <span class="n">other</span>
           <span class="n">arguments</span> <span class="n">of</span> <span class="n">your</span> <span class="n">training</span> <span class="n">script</span><span class="p">)</span>
</pre></div>
</div>
<ol class="arabic simple" start="2">
<li><p>Multi-Node multi-process distributed training: (e.g. two nodes)</p></li>
</ol>
<p>Node 1: <em>(IP: 192.168.1.1, and has a free port: 1234)</em></p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="o">-</span><span class="n">m</span> <span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">launch</span> <span class="o">--</span><span class="n">nproc</span><span class="o">-</span><span class="n">per</span><span class="o">-</span><span class="n">node</span><span class="o">=</span><span class="n">NUM_GPUS_YOU_HAVE</span>
           <span class="o">--</span><span class="n">nnodes</span><span class="o">=</span><span class="mi">2</span> <span class="o">--</span><span class="n">node</span><span class="o">-</span><span class="n">rank</span><span class="o">=</span><span class="mi">0</span> <span class="o">--</span><span class="n">master</span><span class="o">-</span><span class="n">addr</span><span class="o">=</span><span class="s2">&quot;192.168.1.1&quot;</span>
           <span class="o">--</span><span class="n">master</span><span class="o">-</span><span class="n">port</span><span class="o">=</span><span class="mi">1234</span> <span class="n">YOUR_TRAINING_SCRIPT</span><span class="o">.</span><span class="n">py</span> <span class="p">(</span><span class="o">--</span><span class="n">arg1</span> <span class="o">--</span><span class="n">arg2</span> <span class="o">--</span><span class="n">arg3</span>
           <span class="ow">and</span> <span class="nb">all</span> <span class="n">other</span> <span class="n">arguments</span> <span class="n">of</span> <span class="n">your</span> <span class="n">training</span> <span class="n">script</span><span class="p">)</span>
</pre></div>
</div>
<p>Node 2:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="o">-</span><span class="n">m</span> <span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">launch</span> <span class="o">--</span><span class="n">nproc</span><span class="o">-</span><span class="n">per</span><span class="o">-</span><span class="n">node</span><span class="o">=</span><span class="n">NUM_GPUS_YOU_HAVE</span>
           <span class="o">--</span><span class="n">nnodes</span><span class="o">=</span><span class="mi">2</span> <span class="o">--</span><span class="n">node</span><span class="o">-</span><span class="n">rank</span><span class="o">=</span><span class="mi">1</span> <span class="o">--</span><span class="n">master</span><span class="o">-</span><span class="n">addr</span><span class="o">=</span><span class="s2">&quot;192.168.1.1&quot;</span>
           <span class="o">--</span><span class="n">master</span><span class="o">-</span><span class="n">port</span><span class="o">=</span><span class="mi">1234</span> <span class="n">YOUR_TRAINING_SCRIPT</span><span class="o">.</span><span class="n">py</span> <span class="p">(</span><span class="o">--</span><span class="n">arg1</span> <span class="o">--</span><span class="n">arg2</span> <span class="o">--</span><span class="n">arg3</span>
           <span class="ow">and</span> <span class="nb">all</span> <span class="n">other</span> <span class="n">arguments</span> <span class="n">of</span> <span class="n">your</span> <span class="n">training</span> <span class="n">script</span><span class="p">)</span>
</pre></div>
</div>
<ol class="arabic simple" start="3">
<li><p>To look up what optional arguments this module offers:</p></li>
</ol>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">python</span> <span class="o">-</span><span class="n">m</span> <span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">launch</span> <span class="o">--</span><span class="n">help</span>
</pre></div>
</div>
<p><strong>Important Notices:</strong></p>
<p>1. This utility and multi-process distributed (single-node or
multi-node) GPU training currently achieve the best performance only when using
the NCCL distributed backend. Thus, the NCCL backend is the recommended backend to
use for GPU training.</p>
<p>2. In your training program, you must parse the command-line argument:
<code class="docutils literal notranslate"><span class="pre">--local-rank=LOCAL_PROCESS_RANK</span></code>, which will be provided by this module.
If your training program uses GPUs, you should ensure that your code only
runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:</p>
<p>Parsing the local_rank argument</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">argparse</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parser</span> <span class="o">=</span> <span class="n">argparse</span><span class="o">.</span><span class="n">ArgumentParser</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">parser</span><span class="o">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s2">&quot;--local-rank&quot;</span><span class="p">,</span> <span class="s2">&quot;--local_rank&quot;</span><span class="p">,</span> <span class="nb">type</span><span class="o">=</span><span class="nb">int</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">args</span> <span class="o">=</span> <span class="n">parser</span><span class="o">.</span><span class="n">parse_args</span><span class="p">()</span>
</pre></div>
</div>
<p>Set your device to local rank using either</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">set_device</span><span class="p">(</span><span class="n">args</span><span class="o">.</span><span class="n">local_rank</span><span class="p">)</span>  <span class="c1"># before your code runs</span>
</pre></div>
</div>
<p>or</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="n">args</span><span class="o">.</span><span class="n">local_rank</span><span class="p">):</span>
<span class="gp">&gt;&gt;&gt; </span>   <span class="c1"># your code to run</span>
<span class="gp">&gt;&gt;&gt; </span>   <span class="o">...</span>
</pre></div>
</div>
<div class="versionchanged">
<p><span class="versionmodified changed">Changed in version 2.0.0: </span>The launcher will pass the <code class="docutils literal notranslate"><span class="pre">--local-rank=&lt;rank&gt;</span></code> argument to your script.
From PyTorch 2.0.0 onwards, the dashed <code class="docutils literal notranslate"><span class="pre">--local-rank</span></code> is preferred over the
previously used underscored <code class="docutils literal notranslate"><span class="pre">--local_rank</span></code>.</p>
<p>For backward compatibility, it may be necessary for users to handle both
cases in their argument parsing code. This means including both <code class="docutils literal notranslate"><span class="pre">&quot;--local-rank&quot;</span></code>
and <code class="docutils literal notranslate"><span class="pre">&quot;--local_rank&quot;</span></code> in the argument parser. If only <code class="docutils literal notranslate"><span class="pre">&quot;--local_rank&quot;</span></code> is
provided, the launcher will trigger an error: “error: unrecognized arguments:
--local-rank=&lt;rank&gt;”. For training code that only supports PyTorch 2.0.0+,
including <code class="docutils literal notranslate"><span class="pre">&quot;--local-rank&quot;</span></code> should be sufficient.</p>
</div>
<p>3. In your training program, you are supposed to call the following function
at the beginning to start the distributed backend. It is strongly recommended
that <code class="docutils literal notranslate"><span class="pre">init_method=env://</span></code>. Other init methods (e.g. <code class="docutils literal notranslate"><span class="pre">tcp://</span></code>) may work,
but <code class="docutils literal notranslate"><span class="pre">env://</span></code> is the one that is officially supported by this module.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">torch</span><span class="o">.</span><span class="n">distributed</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="n">backend</span><span class="o">=</span><span class="s1">&#39;YOUR BACKEND&#39;</span><span class="p">,</span>
<span class="gp">&gt;&gt;&gt; </span>                                     <span class="n">init_method</span><span class="o">=</span><span class="s1">&#39;env://&#39;</span><span class="p">)</span>
</pre></div>
</div>
<p>4. In your training program, you can either use regular distributed functions
or use <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> module. If your
training program uses GPUs for training and you would like to use
<a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> module,
here is how to configure it.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">parallel</span><span class="o">.</span><span class="n">DistributedDataParallel</span><span class="p">(</span><span class="n">model</span><span class="p">,</span>
<span class="gp">&gt;&gt;&gt; </span>                                                  <span class="n">device_ids</span><span class="o">=</span><span class="p">[</span><span class="n">args</span><span class="o">.</span><span class="n">local_rank</span><span class="p">],</span>
<span class="gp">&gt;&gt;&gt; </span>                                                  <span class="n">output_device</span><span class="o">=</span><span class="n">args</span><span class="o">.</span><span class="n">local_rank</span><span class="p">)</span>
</pre></div>
</div>
<p>Please ensure that the <code class="docutils literal notranslate"><span class="pre">device_ids</span></code> argument is set to be the only GPU device id
that your code will be operating on. This is generally the local rank of the
process. In other words, <code class="docutils literal notranslate"><span class="pre">device_ids</span></code> needs to be <code class="docutils literal notranslate"><span class="pre">[args.local_rank]</span></code>,
and <code class="docutils literal notranslate"><span class="pre">output_device</span></code> needs to be <code class="docutils literal notranslate"><span class="pre">args.local_rank</span></code> in order to use this
utility.</p>
<p>5. Another way to pass <code class="docutils literal notranslate"><span class="pre">local_rank</span></code> to the subprocesses is via the environment variable
<code class="docutils literal notranslate"><span class="pre">LOCAL_RANK</span></code>. This behavior is enabled when you launch the script with
<code class="docutils literal notranslate"><span class="pre">--use-env=True</span></code>. You must adjust the subprocess example above to replace
<code class="docutils literal notranslate"><span class="pre">args.local_rank</span></code> with <code class="docutils literal notranslate"><span class="pre">os.environ['LOCAL_RANK']</span></code>; the launcher
will not pass <code class="docutils literal notranslate"><span class="pre">--local-rank</span></code> when you specify this flag.</p>
<div class="admonition warning">
<p class="admonition-title">Warning</p>
<p><code class="docutils literal notranslate"><span class="pre">local_rank</span></code> is NOT globally unique: it is only unique per process
on a machine.  Thus, don’t use it to decide if you should, e.g.,
write to a networked filesystem.  See
<a class="reference external" href="https://github.com/pytorch/pytorch/issues/12042">https://github.com/pytorch/pytorch/issues/12042</a> for an example of
how things can go wrong if you don’t do this correctly.</p>
</div>
</div>
<div class="section" id="spawn-utility">
<h2>Spawn utility<a class="headerlink" href="#spawn-utility" title="Permalink to this heading">¶</a></h2>
<p>The <a class="reference internal" href="multiprocessing.html#multiprocessing-doc"><span class="std std-ref">Multiprocessing package - torch.multiprocessing</span></a> package also provides a <code class="docutils literal notranslate"><span class="pre">spawn</span></code>
function in <a class="reference internal" href="multiprocessing.html#module-torch.multiprocessing.spawn" title="torch.multiprocessing.spawn"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.multiprocessing.spawn()</span></code></a>. This helper function
can be used to spawn multiple processes. It works by passing in the
function that you want to run and spawns N processes to run it. This
can be used for multiprocess distributed training as well.</p>
<p>For references on how to use it, please refer to <a class="reference external" href="https://github.com/pytorch/examples/tree/master/imagenet">PyTorch example - ImageNet
implementation</a></p>
<p>Note that this function requires Python 3.4 or higher.</p>
</div>
<div class="section" id="debugging-torch-distributed-applications">
<h2>Debugging <code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> applications<a class="headerlink" href="#debugging-torch-distributed-applications" title="Permalink to this heading">¶</a></h2>
<p>Debugging distributed applications can be challenging due to hard-to-understand hangs, crashes, or inconsistent behavior across ranks. <code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> provides
a suite of tools to help debug training applications in a self-serve fashion:</p>
<div class="section" id="python-breakpoint">
<h3>Python Breakpoint<a class="headerlink" href="#python-breakpoint" title="Permalink to this heading">¶</a></h3>
<p>It is extremely convenient to use Python’s debugger in a distributed environment, but because it does not work out of the box, many people do not use it at all.
PyTorch offers a customized wrapper around pdb that streamlines the process.</p>
<p><cite>torch.distributed.breakpoint</cite> makes this process easy.  Internally, it customizes <cite>pdb</cite>’s breakpoint behavior in three ways but otherwise behaves as normal <cite>pdb</cite>:</p>
<ol class="arabic simple">
<li><p>Attaches the debugger only on one rank (specified by the user).</p></li>
<li><p>Ensures all other ranks stop, by using a <cite>torch.distributed.barrier()</cite> that will release once the debugged rank issues a <cite>continue</cite>.</p></li>
<li><p>Reroutes stdin from the child process such that it connects to your terminal.</p></li>
</ol>
<p>To use it, simply issue <cite>torch.distributed.breakpoint(rank)</cite> on all ranks, using the same value for <cite>rank</cite> in each case.</p>
</div>
<div class="section" id="monitored-barrier">
<h3>Monitored Barrier<a class="headerlink" href="#monitored-barrier" title="Permalink to this heading">¶</a></h3>
<p>As of v1.10, <a class="reference internal" href="#torch.distributed.monitored_barrier" title="torch.distributed.monitored_barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.monitored_barrier()</span></code></a> exists as an alternative to <a class="reference internal" href="#torch.distributed.barrier" title="torch.distributed.barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.barrier()</span></code></a> which fails with helpful information about which rank may be faulty
when crashing, i.e. not all ranks calling into <a class="reference internal" href="#torch.distributed.monitored_barrier" title="torch.distributed.monitored_barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.monitored_barrier()</span></code></a> within the provided timeout. <a class="reference internal" href="#torch.distributed.monitored_barrier" title="torch.distributed.monitored_barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.monitored_barrier()</span></code></a> implements a host-side
barrier using <code class="docutils literal notranslate"><span class="pre">send</span></code>/<code class="docutils literal notranslate"><span class="pre">recv</span></code> communication primitives in a process similar to acknowledgements, allowing rank 0 to report which rank(s) failed to acknowledge
the barrier in time. As an example, consider the following function where rank 1 fails to call into <a class="reference internal" href="#torch.distributed.monitored_barrier" title="torch.distributed.monitored_barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.monitored_barrier()</span></code></a> (in practice this could be due
to an application bug or hang in a previous collective):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">from</span> <span class="nn">datetime</span> <span class="kn">import</span> <span class="n">timedelta</span>

<span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="kn">import</span> <span class="nn">torch.multiprocessing</span> <span class="k">as</span> <span class="nn">mp</span>


<span class="k">def</span> <span class="nf">worker</span><span class="p">(</span><span class="n">rank</span><span class="p">):</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="s2">&quot;nccl&quot;</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
    <span class="c1"># monitored barrier requires gloo process group to perform host-side sync.</span>
    <span class="n">group_gloo</span> <span class="o">=</span> <span class="n">dist</span><span class="o">.</span><span class="n">new_group</span><span class="p">(</span><span class="n">backend</span><span class="o">=</span><span class="s2">&quot;gloo&quot;</span><span class="p">)</span>
    <span class="k">if</span> <span class="n">rank</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="mi">1</span><span class="p">]:</span>
        <span class="n">dist</span><span class="o">.</span><span class="n">monitored_barrier</span><span class="p">(</span><span class="n">group</span><span class="o">=</span><span class="n">group_gloo</span><span class="p">,</span> <span class="n">timeout</span><span class="o">=</span><span class="n">timedelta</span><span class="p">(</span><span class="n">seconds</span><span class="o">=</span><span class="mi">2</span><span class="p">))</span>


<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;MASTER_ADDR&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;localhost&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;MASTER_PORT&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;29501&quot;</span>
    <span class="n">mp</span><span class="o">.</span><span class="n">spawn</span><span class="p">(</span><span class="n">worker</span><span class="p">,</span> <span class="n">nprocs</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="p">())</span>
</pre></div>
</div>
<p>The following error message is produced on rank 0, allowing the user to determine which rank(s) may be faulty and investigate further:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="ne">RuntimeError</span><span class="p">:</span> <span class="n">Rank</span> <span class="mi">1</span> <span class="n">failed</span> <span class="n">to</span> <span class="k">pass</span> <span class="n">monitoredBarrier</span> <span class="ow">in</span> <span class="mi">2000</span> <span class="n">ms</span>
 <span class="n">Original</span> <span class="n">exception</span><span class="p">:</span>
<span class="p">[</span><span class="n">gloo</span><span class="o">/</span><span class="n">transport</span><span class="o">/</span><span class="n">tcp</span><span class="o">/</span><span class="n">pair</span><span class="o">.</span><span class="n">cc</span><span class="p">:</span><span class="mi">598</span><span class="p">]</span> <span class="n">Connection</span> <span class="n">closed</span> <span class="n">by</span> <span class="n">peer</span> <span class="p">[</span><span class="mi">2401</span><span class="p">:</span><span class="n">db00</span><span class="p">:</span><span class="n">eef0</span><span class="p">:</span><span class="mi">1100</span><span class="p">:</span><span class="mi">3560</span><span class="p">:</span><span class="mi">0</span><span class="p">:</span><span class="mi">1</span><span class="n">c05</span><span class="p">:</span><span class="mi">25</span><span class="n">d</span><span class="p">]:</span><span class="mi">8594</span>
</pre></div>
</div>
</div>
<div class="section" id="torch-distributed-debug">
<h3><code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code><a class="headerlink" href="#torch-distributed-debug" title="Permalink to this heading">¶</a></h3>
<p>With <code class="docutils literal notranslate"><span class="pre">TORCH_CPP_LOG_LEVEL=INFO</span></code>, the environment variable <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code>  can be used to trigger additional useful logging and collective synchronization checks to ensure all ranks
are synchronized appropriately. <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code> can be set to either <code class="docutils literal notranslate"><span class="pre">OFF</span></code> (default), <code class="docutils literal notranslate"><span class="pre">INFO</span></code>, or <code class="docutils literal notranslate"><span class="pre">DETAIL</span></code> depending on the debugging level
required. Please note that the most verbose option, <code class="docutils literal notranslate"><span class="pre">DETAIL</span></code>, may impact the application performance and thus should only be used when debugging issues.</p>
<p>Setting <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=INFO</span></code> will result in additional debug logging when models trained with <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> are initialized, and
<code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=DETAIL</span></code> will additionally log runtime performance statistics on a select number of iterations. These runtime statistics
include data such as forward time, backward time, gradient communication time, etc. As an example, given the following application:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>

<span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="kn">import</span> <span class="nn">torch.multiprocessing</span> <span class="k">as</span> <span class="nn">mp</span>


<span class="k">class</span> <span class="nc">TwoLinLayerNet</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span>
    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">a</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
        <span class="bp">self</span><span class="o">.</span><span class="n">b</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="n">bias</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>

    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
        <span class="n">a</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">a</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
        <span class="n">b</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">b</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
        <span class="k">return</span> <span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span>


<span class="k">def</span> <span class="nf">worker</span><span class="p">(</span><span class="n">rank</span><span class="p">):</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="s2">&quot;nccl&quot;</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
    <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">set_device</span><span class="p">(</span><span class="n">rank</span><span class="p">)</span>
    <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;init model&quot;</span><span class="p">)</span>
    <span class="n">model</span> <span class="o">=</span> <span class="n">TwoLinLayerNet</span><span class="p">()</span><span class="o">.</span><span class="n">cuda</span><span class="p">()</span>
    <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;init ddp&quot;</span><span class="p">)</span>
    <span class="n">ddp_model</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">parallel</span><span class="o">.</span><span class="n">DistributedDataParallel</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">device_ids</span><span class="o">=</span><span class="p">[</span><span class="n">rank</span><span class="p">])</span>

    <span class="n">inp</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span><span class="p">,</span> <span class="mi">10</span><span class="p">)</span><span class="o">.</span><span class="n">cuda</span><span class="p">()</span>
    <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;train&quot;</span><span class="p">)</span>

    <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">20</span><span class="p">):</span>
        <span class="n">output</span> <span class="o">=</span> <span class="n">ddp_model</span><span class="p">(</span><span class="n">inp</span><span class="p">)</span>
        <span class="n">loss</span> <span class="o">=</span> <span class="n">output</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="n">output</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
        <span class="n">loss</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span>


<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;MASTER_ADDR&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;localhost&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;MASTER_PORT&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;29501&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;TORCH_CPP_LOG_LEVEL&quot;</span><span class="p">]</span><span class="o">=</span><span class="s2">&quot;INFO&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span>
        <span class="s2">&quot;TORCH_DISTRIBUTED_DEBUG&quot;</span>
    <span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;DETAIL&quot;</span>  <span class="c1"># set to DETAIL for runtime logging.</span>
    <span class="n">mp</span><span class="o">.</span><span class="n">spawn</span><span class="p">(</span><span class="n">worker</span><span class="p">,</span> <span class="n">nprocs</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="p">())</span>
</pre></div>
</div>
<p>The following logs are rendered at initialization time:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">I0607</span> <span class="mi">16</span><span class="p">:</span><span class="mi">10</span><span class="p">:</span><span class="mf">35.739390</span> <span class="mi">515217</span> <span class="n">logger</span><span class="o">.</span><span class="n">cpp</span><span class="p">:</span><span class="mi">173</span><span class="p">]</span> <span class="p">[</span><span class="n">Rank</span> <span class="mi">0</span><span class="p">]:</span> <span class="n">DDP</span> <span class="n">Initialized</span> <span class="k">with</span><span class="p">:</span>
<span class="n">broadcast_buffers</span><span class="p">:</span> <span class="mi">1</span>
<span class="n">bucket_cap_bytes</span><span class="p">:</span> <span class="mi">26214400</span>
<span class="n">find_unused_parameters</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">gradient_as_bucket_view</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">is_multi_device_module</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">iteration</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">num_parameter_tensors</span><span class="p">:</span> <span class="mi">2</span>
<span class="n">output_device</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">rank</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">total_parameter_size_bytes</span><span class="p">:</span> <span class="mi">440</span>
<span class="n">world_size</span><span class="p">:</span> <span class="mi">2</span>
<span class="n">backend_name</span><span class="p">:</span> <span class="n">nccl</span>
<span class="n">bucket_sizes</span><span class="p">:</span> <span class="mi">440</span>
<span class="n">cuda_visible_devices</span><span class="p">:</span> <span class="n">N</span><span class="o">/</span><span class="n">A</span>
<span class="n">device_ids</span><span class="p">:</span> <span class="mi">0</span>
<span class="n">dtypes</span><span class="p">:</span> <span class="nb">float</span>
<span class="n">master_addr</span><span class="p">:</span> <span class="n">localhost</span>
<span class="n">master_port</span><span class="p">:</span> <span class="mi">29501</span>
<span class="n">module_name</span><span class="p">:</span> <span class="n">TwoLinLayerNet</span>
<span class="n">nccl_async_error_handling</span><span class="p">:</span> <span class="n">N</span><span class="o">/</span><span class="n">A</span>
<span class="n">nccl_blocking_wait</span><span class="p">:</span> <span class="n">N</span><span class="o">/</span><span class="n">A</span>
<span class="n">nccl_debug</span><span class="p">:</span> <span class="n">WARN</span>
<span class="n">nccl_ib_timeout</span><span class="p">:</span> <span class="n">N</span><span class="o">/</span><span class="n">A</span>
<span class="n">nccl_nthreads</span><span class="p">:</span> <span class="n">N</span><span class="o">/</span><span class="n">A</span>
<span class="n">nccl_socket_ifname</span><span class="p">:</span> <span class="n">N</span><span class="o">/</span><span class="n">A</span>
<span class="n">torch_distributed_debug</span><span class="p">:</span> <span class="n">INFO</span>
</pre></div>
</div>
<p>The following logs are rendered during runtime (when <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=DETAIL</span></code> is set):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">I0607</span> <span class="mi">16</span><span class="p">:</span><span class="mi">18</span><span class="p">:</span><span class="mf">58.085681</span> <span class="mi">544067</span> <span class="n">logger</span><span class="o">.</span><span class="n">cpp</span><span class="p">:</span><span class="mi">344</span><span class="p">]</span> <span class="p">[</span><span class="n">Rank</span> <span class="mi">1</span> <span class="o">/</span> <span class="mi">2</span><span class="p">]</span> <span class="n">Training</span> <span class="n">TwoLinLayerNet</span> <span class="n">unused_parameter_size</span><span class="o">=</span><span class="mi">0</span>
 <span class="n">Avg</span> <span class="n">forward</span> <span class="n">compute</span> <span class="n">time</span><span class="p">:</span> <span class="mi">40838608</span>
 <span class="n">Avg</span> <span class="n">backward</span> <span class="n">compute</span> <span class="n">time</span><span class="p">:</span> <span class="mi">5983335</span>
<span class="n">Avg</span> <span class="n">backward</span> <span class="n">comm</span><span class="o">.</span> <span class="n">time</span><span class="p">:</span> <span class="mi">4326421</span>
 <span class="n">Avg</span> <span class="n">backward</span> <span class="n">comm</span><span class="o">/</span><span class="n">comp</span> <span class="n">overlap</span> <span class="n">time</span><span class="p">:</span> <span class="mi">4207652</span>
<span class="n">I0607</span> <span class="mi">16</span><span class="p">:</span><span class="mi">18</span><span class="p">:</span><span class="mf">58.085693</span> <span class="mi">544066</span> <span class="n">logger</span><span class="o">.</span><span class="n">cpp</span><span class="p">:</span><span class="mi">344</span><span class="p">]</span> <span class="p">[</span><span class="n">Rank</span> <span class="mi">0</span> <span class="o">/</span> <span class="mi">2</span><span class="p">]</span> <span class="n">Training</span> <span class="n">TwoLinLayerNet</span> <span class="n">unused_parameter_size</span><span class="o">=</span><span class="mi">0</span>
 <span class="n">Avg</span> <span class="n">forward</span> <span class="n">compute</span> <span class="n">time</span><span class="p">:</span> <span class="mi">42850427</span>
 <span class="n">Avg</span> <span class="n">backward</span> <span class="n">compute</span> <span class="n">time</span><span class="p">:</span> <span class="mi">3885553</span>
<span class="n">Avg</span> <span class="n">backward</span> <span class="n">comm</span><span class="o">.</span> <span class="n">time</span><span class="p">:</span> <span class="mi">2357981</span>
 <span class="n">Avg</span> <span class="n">backward</span> <span class="n">comm</span><span class="o">/</span><span class="n">comp</span> <span class="n">overlap</span> <span class="n">time</span><span class="p">:</span> <span class="mi">2234674</span>
</pre></div>
</div>
<p>In addition, <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=INFO</span></code> enhances crash logging in <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> due to unused parameters in the model. Currently, <code class="docutils literal notranslate"><span class="pre">find_unused_parameters=True</span></code>
must be passed into <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> initialization if there are parameters that may be unused in the forward pass, and as of v1.10, all model outputs are required
to be used in loss computation as <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> does not support unused parameters in the backwards pass. These constraints are challenging especially for larger
models, thus when crashing with an error, <a class="reference internal" href="generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.nn.parallel.DistributedDataParallel()</span></code></a> will log the fully qualified name of all parameters that went unused. For example, in the above application,
if we modify <code class="docutils literal notranslate"><span class="pre">loss</span></code> to be instead computed as <code class="docutils literal notranslate"><span class="pre">loss</span> <span class="pre">=</span> <span class="pre">output[1]</span></code>, then <code class="docutils literal notranslate"><span class="pre">TwoLinLayerNet.a</span></code> does not receive a gradient in the backwards pass, and
thus results in <code class="docutils literal notranslate"><span class="pre">DDP</span></code> failing. On a crash, the user is passed information about parameters which went unused, which may be challenging to manually find for large models:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one. This error indicates that your module has parameters that were not used in producing loss. You can enable unused parameter detection by passing
 the keyword argument `find_unused_parameters=True` to `torch.nn.parallel.DistributedDataParallel`, and by
making sure all `forward` function outputs participate in calculating loss.
If you already have done the above, then the distributed data parallel module wasn&#39;t able to locate the output tensors in the return value of your module&#39;s `forward` function. Please include the loss function and the structure of the return value of `forward` of your module when reporting this issue (e.g. list, dict, iterable).
Parameters which did not receive grad for rank 0: a.weight
Parameter indices which did not receive grad for rank 0: 0
</pre></div>
</div>
<p>Setting <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=DETAIL</span></code> will trigger additional consistency and synchronization checks on every collective call issued by the user
either directly or indirectly (such as DDP <code class="docutils literal notranslate"><span class="pre">allreduce</span></code>). This is done by creating a wrapper process group that wraps all process groups returned by
<a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.init_process_group()</span></code></a> and <a class="reference internal" href="#torch.distributed.new_group" title="torch.distributed.new_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.new_group()</span></code></a> APIs. As a result, these APIs will return a wrapper process group that can be used exactly like a regular process
group, but performs consistency checks before dispatching the collective to an underlying process group. Currently, these checks include a <a class="reference internal" href="#torch.distributed.monitored_barrier" title="torch.distributed.monitored_barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.monitored_barrier()</span></code></a>,
which ensures all ranks complete their outstanding collective calls and reports ranks which are stuck. Next, the collective itself is checked for consistency by
ensuring all collective functions match and are called with consistent tensor shapes. If this is not the case, a detailed error report is included when the
application crashes, rather than a hang or uninformative error message. As an example, consider the following function which has mismatched input shapes into
<a class="reference internal" href="#torch.distributed.all_reduce" title="torch.distributed.all_reduce"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.all_reduce()</span></code></a>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>

<span class="kn">import</span> <span class="nn">torch</span>
<span class="kn">import</span> <span class="nn">torch.distributed</span> <span class="k">as</span> <span class="nn">dist</span>
<span class="kn">import</span> <span class="nn">torch.multiprocessing</span> <span class="k">as</span> <span class="nn">mp</span>


<span class="k">def</span> <span class="nf">worker</span><span class="p">(</span><span class="n">rank</span><span class="p">):</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">init_process_group</span><span class="p">(</span><span class="s2">&quot;nccl&quot;</span><span class="p">,</span> <span class="n">rank</span><span class="o">=</span><span class="n">rank</span><span class="p">,</span> <span class="n">world_size</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
    <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">set_device</span><span class="p">(</span><span class="n">rank</span><span class="p">)</span>
    <span class="n">tensor</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10</span> <span class="k">if</span> <span class="n">rank</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="mi">20</span><span class="p">)</span><span class="o">.</span><span class="n">cuda</span><span class="p">()</span>
    <span class="n">dist</span><span class="o">.</span><span class="n">all_reduce</span><span class="p">(</span><span class="n">tensor</span><span class="p">)</span>
    <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">synchronize</span><span class="p">(</span><span class="n">device</span><span class="o">=</span><span class="n">rank</span><span class="p">)</span>


<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;MASTER_ADDR&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;localhost&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;MASTER_PORT&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;29501&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;TORCH_CPP_LOG_LEVEL&quot;</span><span class="p">]</span><span class="o">=</span><span class="s2">&quot;INFO&quot;</span>
    <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;TORCH_DISTRIBUTED_DEBUG&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;DETAIL&quot;</span>
    <span class="n">mp</span><span class="o">.</span><span class="n">spawn</span><span class="p">(</span><span class="n">worker</span><span class="p">,</span> <span class="n">nprocs</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">args</span><span class="o">=</span><span class="p">())</span>
</pre></div>
</div>
<p>With the <code class="docutils literal notranslate"><span class="pre">NCCL</span></code> backend, such an application would likely result in a hang which can be challenging to root-cause in nontrivial scenarios. If the user enables
<code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=DETAIL</span></code> and reruns the application, the following error message reveals the root cause:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">work</span> <span class="o">=</span> <span class="n">default_pg</span><span class="o">.</span><span class="n">allreduce</span><span class="p">([</span><span class="n">tensor</span><span class="p">],</span> <span class="n">opts</span><span class="p">)</span>
<span class="ne">RuntimeError</span><span class="p">:</span> <span class="n">Error</span> <span class="n">when</span> <span class="n">verifying</span> <span class="n">shape</span> <span class="n">tensors</span> <span class="k">for</span> <span class="n">collective</span> <span class="n">ALLREDUCE</span> <span class="n">on</span> <span class="n">rank</span> <span class="mf">0.</span> <span class="n">This</span> <span class="n">likely</span> <span class="n">indicates</span> <span class="n">that</span> <span class="nb">input</span> <span class="n">shapes</span> <span class="n">into</span> <span class="n">the</span> <span class="n">collective</span> <span class="n">are</span> <span class="n">mismatched</span> <span class="n">across</span> <span class="n">ranks</span><span class="o">.</span> <span class="n">Got</span> <span class="n">shapes</span><span class="p">:</span>  <span class="mi">10</span>
<span class="p">[</span> <span class="n">torch</span><span class="o">.</span><span class="n">LongTensor</span><span class="p">{</span><span class="mi">1</span><span class="p">}</span> <span class="p">]</span>
</pre></div>
</div>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>For fine-grained control of the debug level during runtime the functions <code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.set_debug_level()</span></code>, <code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.set_debug_level_from_env()</span></code>, and
<code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.get_debug_level()</span></code> can also be used.</p>
</div>
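<p>In addition, <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG=DETAIL</span></code> can be used in conjunction with <code class="docutils literal notranslate"><span class="pre">TORCH_SHOW_CPP_STACKTRACES=1</span></code> to log the entire callstack when a collective desynchronization is detected. These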
collective desynchronization checks will work for all applications that use <code class="docutils literal notranslate"><span class="pre">c10d</span></code> collective calls backed by process groups created with the
<a class="reference internal" href="#torch.distributed.init_process_group" title="torch.distributed.init_process_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.init_process_group()</span></code></a> and <a class="reference internal" href="#torch.distributed.new_group" title="torch.distributed.new_group"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.new_group()</span></code></a> APIs.</p>
</div>
</div>
<div class="section" id="logging">
<h2>Logging<a class="headerlink" href="#logging" title="Permalink to this heading">¶</a></h2>
<p>In addition to explicit debugging support via <a class="reference internal" href="#torch.distributed.monitored_barrier" title="torch.distributed.monitored_barrier"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.distributed.monitored_barrier()</span></code></a> and <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code>, the underlying C++ library of <code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> also outputs log
messages at various levels. These messages can be helpful to understand the execution state of a distributed training job and to troubleshoot problems such as network connection failures. The
following matrix shows how the log level can be adjusted via the combination of <code class="docutils literal notranslate"><span class="pre">TORCH_CPP_LOG_LEVEL</span></code> and <code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code> environment variables.</p>
<table class="docutils colwidths-auto align-default">
<thead>
<tr class="row-odd"><th class="head" scope="col"><p><code class="docutils literal notranslate"><span class="pre">TORCH_CPP_LOG_LEVEL</span></code></p></th>
<th class="head" scope="col"><p><code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code></p></th>
<th class="head" scope="col"><p>Effective Log Level</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">ERROR</span></code></p></td>
<td><p>ignored</p></td>
<td><p>Error</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">WARNING</span></code></p></td>
<td><p>ignored</p></td>
<td><p>Warning</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">INFO</span></code></p></td>
<td><p>ignored</p></td>
<td><p>Info</p></td>
</tr>
<tr class="row-odd"><td><p><code class="docutils literal notranslate"><span class="pre">INFO</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">INFO</span></code></p></td>
<td><p>Debug</p></td>
</tr>
<tr class="row-even"><td><p><code class="docutils literal notranslate"><span class="pre">INFO</span></code></p></td>
<td><p><code class="docutils literal notranslate"><span class="pre">DETAIL</span></code></p></td>
<td><p>Trace (a.k.a. All)</p></td>
</tr>
</tbody>
</table>
<p>Distributed components raise custom Exception types derived from <cite>RuntimeError</cite>:</p>
<ul class="simple">
<li><p><cite>torch.distributed.DistError</cite>: This is the base type of all distributed exceptions.</p></li>
<li><p><cite>torch.distributed.DistBackendError</cite>: This exception is thrown when a backend-specific error occurs, for example when
the <cite>NCCL</cite> backend is used and the user attempts to use a GPU that is not available to the <cite>NCCL</cite> library.</p></li>
<li><p><cite>torch.distributed.DistNetworkError</cite>: This exception is thrown when networking
libraries encounter errors (e.g., Connection reset by peer).</p></li>
<li><p><cite>torch.distributed.DistStoreError</cite>: This exception is thrown when the Store encounters
an error (e.g., TCPStore timeout).</p></li>
</ul>
<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.DistError">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">DistError</span></span><a class="headerlink" href="#torch.distributed.DistError" title="Permalink to this definition">¶</a></dt>
<dd><p>Exception raised when an error occurs in the distributed library</p>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.DistBackendError">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">DistBackendError</span></span><a class="headerlink" href="#torch.distributed.DistBackendError" title="Permalink to this definition">¶</a></dt>
<dd><p>Exception raised when a backend error occurs in distributed</p>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.DistNetworkError">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">DistNetworkError</span></span><a class="headerlink" href="#torch.distributed.DistNetworkError" title="Permalink to this definition">¶</a></dt>
<dd><p>Exception raised when a network error occurs in distributed</p>
</dd></dl>

<dl class="py class">
<dt class="sig sig-object py" id="torch.distributed.DistStoreError">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">DistStoreError</span></span><a class="headerlink" href="#torch.distributed.DistStoreError" title="Permalink to this definition">¶</a></dt>
<dd><p>Exception raised when an error occurs in the distributed store</p>
</dd></dl>

<p>If you are running single node training, it may be convenient to interactively breakpoint your script.  We offer a way to conveniently breakpoint a single rank:</p>
<dl class="py function">
<dt class="sig sig-object py" id="torch.distributed.breakpoint">
<span class="sig-prename descclassname"><span class="pre">torch.distributed.</span></span><span class="sig-name descname"><span class="pre">breakpoint</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">rank</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/torch/distributed.html#breakpoint"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#torch.distributed.breakpoint" title="Permalink to this definition">¶</a></dt>
<dd><p>Set a breakpoint, but only on a single rank.  All other ranks will wait for you to be
done with the breakpoint before continuing.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>rank</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Which rank to break on.  Default: <code class="docutils literal notranslate"><span class="pre">0</span></code></p></li>
<li><p><strong>skip</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.13)"><em>int</em></a>) – Skip the first <code class="docutils literal notranslate"><span class="pre">skip</span></code> calls to this breakpoint. Default: <code class="docutils literal notranslate"><span class="pre">0</span></code>.</p></li>
</ul>
</dd>
</dl>
</dd></dl>

<span class="target" id="module-torch.distributed.algorithms"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks"></span><span class="target" id="module-torch.distributed.algorithms.model_averaging"></span><span class="target" id="module-torch.distributed.elastic"></span><span class="target" id="module-torch.distributed.elastic.utils"></span><span class="target" id="module-torch.distributed.elastic.utils.data"></span><span class="target" id="module-torch.distributed.launcher"></span><span class="target" id="module-torch.distributed.nn"></span><span class="target" id="module-torch.distributed.nn.api"></span><span class="target" id="module-torch.distributed.nn.jit"></span><span class="target" id="module-torch.distributed.nn.jit.templates"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.default_hooks"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.mixed_precision_hooks"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.optimizer_overlap_hooks"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook"></span><span class="target" id="module-torch.distributed.algorithms.ddp_comm_hooks.quantization_hooks"></span><span class="target" id="module-torch.distributed.algorithms.join"></span><span class="target" id="module-torch.distributed.algorithms.model_averaging.averagers"></span><span class="target" id="module-torch.distributed.algorithms.model_averaging.hierarchical_model_averager"></span><span class="target" id="module-torch.distributed.algorithms.model_averaging.utils"></span><span class="target" 
id="module-torch.distributed.argparse_util"></span><span class="target" id="module-torch.distributed.c10d_logger"></span><span class="target" id="module-torch.distributed.checkpoint.api"></span><span class="target" id="module-torch.distributed.checkpoint.default_planner"></span><span class="target" id="module-torch.distributed.checkpoint.filesystem"></span><span class="target" id="module-torch.distributed.checkpoint.metadata"></span><span class="target" id="module-torch.distributed.checkpoint.optimizer"></span><span class="target" id="module-torch.distributed.checkpoint.planner"></span><span class="target" id="module-torch.distributed.checkpoint.planner_helpers"></span><span class="target" id="module-torch.distributed.checkpoint.resharding"></span><span class="target" id="module-torch.distributed.checkpoint.state_dict_loader"></span><span class="target" id="module-torch.distributed.checkpoint.state_dict_saver"></span><span class="target" id="module-torch.distributed.checkpoint.stateful"></span><span class="target" id="module-torch.distributed.checkpoint.storage"></span><span class="target" id="module-torch.distributed.checkpoint.utils"></span><span class="target" id="module-torch.distributed.collective_utils"></span><span class="target" id="module-torch.distributed.constants"></span><span class="target" id="module-torch.distributed.device_mesh"></span><span class="target" id="module-torch.distributed.distributed_c10d"></span><span class="target" id="module-torch.distributed.elastic.agent.server.api"></span><span class="target" id="module-torch.distributed.elastic.agent.server.local_elastic_agent"></span><span class="target" id="module-torch.distributed.elastic.events.api"></span><span class="target" id="module-torch.distributed.elastic.events.handlers"></span><span class="target" id="module-torch.distributed.elastic.metrics.api"></span><span class="target" id="module-torch.distributed.elastic.multiprocessing.api"></span><span class="target" 
id="module-torch.distributed.elastic.multiprocessing.errors.error_handler"></span><span class="target" id="module-torch.distributed.elastic.multiprocessing.errors.handlers"></span><span class="target" id="module-torch.distributed.elastic.multiprocessing.redirects"></span><span class="target" id="module-torch.distributed.elastic.multiprocessing.tail_log"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.api"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.c10d_rendezvous_backend"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.dynamic_rendezvous"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.etcd_rendezvous"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.etcd_rendezvous_backend"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.etcd_server"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.etcd_store"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.static_tcp_rendezvous"></span><span class="target" id="module-torch.distributed.elastic.rendezvous.utils"></span><span class="target" id="module-torch.distributed.elastic.timer.api"></span><span class="target" id="module-torch.distributed.elastic.timer.file_based_local_timer"></span><span class="target" id="module-torch.distributed.elastic.timer.local_timer"></span><span class="target" id="module-torch.distributed.elastic.utils.api"></span><span class="target" id="module-torch.distributed.elastic.utils.data.cycling_iterator"></span><span class="target" id="module-torch.distributed.elastic.utils.data.elastic_distributed_sampler"></span><span class="target" id="module-torch.distributed.elastic.utils.distributed"></span><span class="target" id="module-torch.distributed.elastic.utils.log_level"></span><span class="target" id="module-torch.distributed.elastic.utils.logging"></span><span class="target" 
id="module-torch.distributed.elastic.utils.store"></span><span class="target" id="module-torch.distributed.fsdp.api"></span><span class="target" id="module-torch.distributed.fsdp.fully_sharded_data_parallel"></span><span class="target" id="module-torch.distributed.fsdp.sharded_grad_scaler"></span><span class="target" id="module-torch.distributed.fsdp.wrap"></span><span class="target" id="module-torch.distributed.launcher.api"></span><span class="target" id="module-torch.distributed.logging_handlers"></span><span class="target" id="module-torch.distributed.nn.api.remote_module"></span><span class="target" id="module-torch.distributed.nn.functional"></span><span class="target" id="module-torch.distributed.nn.jit.instantiator"></span><span class="target" id="module-torch.distributed.nn.jit.templates.remote_module_template"></span><span class="target" id="module-torch.distributed.optim.apply_optimizer_in_backward"></span><span class="target" id="module-torch.distributed.optim.functional_adadelta"></span><span class="target" id="module-torch.distributed.optim.functional_adagrad"></span><span class="target" id="module-torch.distributed.optim.functional_adam"></span><span class="target" id="module-torch.distributed.optim.functional_adamax"></span><span class="target" id="module-torch.distributed.optim.functional_adamw"></span><span class="target" id="module-torch.distributed.optim.functional_rmsprop"></span><span class="target" id="module-torch.distributed.optim.functional_rprop"></span><span class="target" id="module-torch.distributed.optim.functional_sgd"></span><span class="target" id="module-torch.distributed.optim.named_optimizer"></span><span class="target" id="module-torch.distributed.optim.optimizer"></span><span class="target" id="module-torch.distributed.optim.post_localSGD_optimizer"></span><span class="target" id="module-torch.distributed.optim.utils"></span><span class="target" id="module-torch.distributed.optim.zero_redundancy_optimizer"></span><span 
class="target" id="module-torch.distributed.remote_device"></span><span class="target" id="module-torch.distributed.rendezvous"></span><span class="target" id="module-torch.distributed.rpc.api"></span><span class="target" id="module-torch.distributed.rpc.backend_registry"></span><span class="target" id="module-torch.distributed.rpc.constants"></span><span class="target" id="module-torch.distributed.rpc.functions"></span><span class="target" id="module-torch.distributed.rpc.internal"></span><span class="target" id="module-torch.distributed.rpc.options"></span><span class="target" id="module-torch.distributed.rpc.rref_proxy"></span><span class="target" id="module-torch.distributed.rpc.server_process_global_profiler"></span><span class="target" id="module-torch.distributed.tensor.parallel.api"></span><span class="target" id="module-torch.distributed.tensor.parallel.ddp"></span><span class="target" id="module-torch.distributed.tensor.parallel.fsdp"></span><span class="target" id="module-torch.distributed.tensor.parallel.input_reshard"></span><span class="target" id="module-torch.distributed.tensor.parallel.loss"></span><span class="target" id="module-torch.distributed.tensor.parallel.style"></span><span class="target" id="module-torch.distributed.utils"></span><span class="target" id="module-torch.distributed.checkpoint.state_dict"></span></div>
</div>


             </article>
             
            </div>
            <footer>
  
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="distributed.tensor.html" class="btn btn-neutral float-right" title="torch.distributed.tensor" accesskey="n" rel="next">Next <img src="_static/images/chevron-right-orange.svg" class="next-page" alt=""></a>
      
      
        <a href="torch.compiler_fake_tensor.html" class="btn btn-neutral" title="Fake tensor" accesskey="p" rel="prev"><img src="_static/images/chevron-right-orange.svg" class="previous-page" alt=""> Previous</a>
      
    </div>
  

    <hr>

  
  <div role="contentinfo">
    <p>
        &copy; Copyright 2023, PyTorch Contributors.

    </p>
  </div>
    
      <div>
        Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
      </div>
     

</footer>

          </div>
<script>

// Prepend an "internal API" notice to the article when the page URL names an
// underscore-prefixed HTML file (e.g. ".../_foo.html") or contains "_dynamo".
var match = window.location.href.match(/\/_[a-zA-Z0-9_]*\.html|_dynamo/gi);
// String.prototype.match() returns null when nothing matches, so only index
// into the result when it exists; -1 mirrors lastIndexOf()'s "not found" value.
var url = match ? window.location.href.lastIndexOf(match[match.length - 1]) : -1;

if (match && url)
  {
    var div = '<div class="admonition note"><p class="admonition-title">Note</p><p><i class="fa fa-exclamation-circle" aria-hidden="true">&nbsp</i> This page describes an internal API which is not intended to be used outside of the PyTorch codebase and can be modified or removed without notice.</p></div>';
    var pytorchArticle = document.getElementById("pytorch-article");
    // Skip the insertion if the page has no article container.
    if (pytorchArticle)
      {
        pytorchArticle.insertAdjacentHTML('afterBegin', div);
      }
  }
</script>
        </div>

        <div class="pytorch-content-right" id="pytorch-content-right">
          <div class="pytorch-right-menu" id="pytorch-right-menu">
            <div class="pytorch-side-scroll" id="pytorch-side-scroll-right">
              <ul>
<li><a class="reference internal" href="#">Distributed communication package - torch.distributed</a><ul>
<li><a class="reference internal" href="#backends">Backends</a><ul>
<li><a class="reference internal" href="#backends-that-come-with-pytorch">Backends that come with PyTorch</a></li>
<li><a class="reference internal" href="#which-backend-to-use">Which backend to use?</a></li>
<li><a class="reference internal" href="#common-environment-variables">Common environment variables</a><ul>
<li><a class="reference internal" href="#choosing-the-network-interface-to-use">Choosing the network interface to use</a></li>
<li><a class="reference internal" href="#other-nccl-environment-variables">Other NCCL environment variables</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#basics">Basics</a></li>
<li><a class="reference internal" href="#initialization">Initialization</a><ul>
<li><a class="reference internal" href="#torch.distributed.is_available"><code class="docutils literal notranslate"><span class="pre">is_available()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.init_process_group"><code class="docutils literal notranslate"><span class="pre">init_process_group()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.device_mesh.init_device_mesh"><code class="docutils literal notranslate"><span class="pre">init_device_mesh()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.is_initialized"><code class="docutils literal notranslate"><span class="pre">is_initialized()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.is_mpi_available"><code class="docutils literal notranslate"><span class="pre">is_mpi_available()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.is_nccl_available"><code class="docutils literal notranslate"><span class="pre">is_nccl_available()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.is_gloo_available"><code class="docutils literal notranslate"><span class="pre">is_gloo_available()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.is_torchelastic_launched"><code class="docutils literal notranslate"><span class="pre">is_torchelastic_launched()</span></code></a></li>
<li><a class="reference internal" href="#tcp-initialization">TCP initialization</a></li>
<li><a class="reference internal" href="#shared-file-system-initialization">Shared file-system initialization</a></li>
<li><a class="reference internal" href="#environment-variable-initialization">Environment variable initialization</a></li>
</ul>
</li>
<li><a class="reference internal" href="#post-initialization">Post-Initialization</a><ul>
<li><a class="reference internal" href="#torch.distributed.Backend"><code class="docutils literal notranslate"><span class="pre">Backend</span></code></a><ul>
<li><a class="reference internal" href="#torch.distributed.Backend.register_backend"><code class="docutils literal notranslate"><span class="pre">Backend.register_backend()</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#torch.distributed.get_backend"><code class="docutils literal notranslate"><span class="pre">get_backend()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.get_rank"><code class="docutils literal notranslate"><span class="pre">get_rank()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.get_world_size"><code class="docutils literal notranslate"><span class="pre">get_world_size()</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#shutdown">Shutdown</a><ul>
<li><a class="reference internal" href="#reinitialization">Reinitialization</a></li>
</ul>
</li>
<li><a class="reference internal" href="#distributed-key-value-store">Distributed Key-Value Store</a><ul>
<li><a class="reference internal" href="#torch.distributed.Store"><code class="docutils literal notranslate"><span class="pre">Store</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.TCPStore"><code class="docutils literal notranslate"><span class="pre">TCPStore</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.HashStore"><code class="docutils literal notranslate"><span class="pre">HashStore</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.FileStore"><code class="docutils literal notranslate"><span class="pre">FileStore</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.PrefixStore"><code class="docutils literal notranslate"><span class="pre">PrefixStore</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.set"><code class="docutils literal notranslate"><span class="pre">set()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.get"><code class="docutils literal notranslate"><span class="pre">get()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.add"><code class="docutils literal notranslate"><span class="pre">add()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.compare_set"><code class="docutils literal notranslate"><span class="pre">compare_set()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.wait"><code class="docutils literal notranslate"><span class="pre">wait()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.num_keys"><code class="docutils literal notranslate"><span class="pre">num_keys()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.delete_key"><code class="docutils literal notranslate"><span class="pre">delete_key()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Store.set_timeout"><code class="docutils literal notranslate"><span class="pre">set_timeout()</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#groups">Groups</a><ul>
<li><a class="reference internal" href="#torch.distributed.new_group"><code class="docutils literal notranslate"><span class="pre">new_group()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.get_group_rank"><code class="docutils literal notranslate"><span class="pre">get_group_rank()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.get_global_rank"><code class="docutils literal notranslate"><span class="pre">get_global_rank()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.get_process_group_ranks"><code class="docutils literal notranslate"><span class="pre">get_process_group_ranks()</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#devicemesh">DeviceMesh</a><ul>
<li><a class="reference internal" href="#torch.distributed.device_mesh.DeviceMesh"><code class="docutils literal notranslate"><span class="pre">DeviceMesh</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#point-to-point-communication">Point-to-point communication</a><ul>
<li><a class="reference internal" href="#torch.distributed.send"><code class="docutils literal notranslate"><span class="pre">send()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.recv"><code class="docutils literal notranslate"><span class="pre">recv()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.isend"><code class="docutils literal notranslate"><span class="pre">isend()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.irecv"><code class="docutils literal notranslate"><span class="pre">irecv()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.send_object_list"><code class="docutils literal notranslate"><span class="pre">send_object_list()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.recv_object_list"><code class="docutils literal notranslate"><span class="pre">recv_object_list()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.batch_isend_irecv"><code class="docutils literal notranslate"><span class="pre">batch_isend_irecv()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.P2POp"><code class="docutils literal notranslate"><span class="pre">P2POp</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#synchronous-and-asynchronous-collective-operations">Synchronous and asynchronous collective operations</a></li>
<li><a class="reference internal" href="#collective-functions">Collective functions</a><ul>
<li><a class="reference internal" href="#torch.distributed.broadcast"><code class="docutils literal notranslate"><span class="pre">broadcast()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.broadcast_object_list"><code class="docutils literal notranslate"><span class="pre">broadcast_object_list()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.all_reduce"><code class="docutils literal notranslate"><span class="pre">all_reduce()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.reduce"><code class="docutils literal notranslate"><span class="pre">reduce()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.all_gather"><code class="docutils literal notranslate"><span class="pre">all_gather()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.all_gather_into_tensor"><code class="docutils literal notranslate"><span class="pre">all_gather_into_tensor()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.all_gather_object"><code class="docutils literal notranslate"><span class="pre">all_gather_object()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.gather"><code class="docutils literal notranslate"><span class="pre">gather()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.gather_object"><code class="docutils literal notranslate"><span class="pre">gather_object()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.scatter"><code class="docutils literal notranslate"><span class="pre">scatter()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.scatter_object_list"><code class="docutils literal notranslate"><span class="pre">scatter_object_list()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.reduce_scatter"><code class="docutils literal notranslate"><span class="pre">reduce_scatter()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.reduce_scatter_tensor"><code class="docutils literal notranslate"><span class="pre">reduce_scatter_tensor()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.all_to_all_single"><code class="docutils literal notranslate"><span class="pre">all_to_all_single()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.all_to_all"><code class="docutils literal notranslate"><span class="pre">all_to_all()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.barrier"><code class="docutils literal notranslate"><span class="pre">barrier()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.monitored_barrier"><code class="docutils literal notranslate"><span class="pre">monitored_barrier()</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.Work"><code class="docutils literal notranslate"><span class="pre">Work</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.ReduceOp"><code class="docutils literal notranslate"><span class="pre">ReduceOp</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.reduce_op"><code class="docutils literal notranslate"><span class="pre">reduce_op</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#profiling-collective-communication">Profiling Collective Communication</a></li>
<li><a class="reference internal" href="#multi-gpu-collective-functions">Multi-GPU collective functions</a></li>
<li><a class="reference internal" href="#third-party-backends">Third-party backends</a></li>
<li><a class="reference internal" href="#launch-utility">Launch utility</a></li>
<li><a class="reference internal" href="#spawn-utility">Spawn utility</a></li>
<li><a class="reference internal" href="#debugging-torch-distributed-applications">Debugging <code class="docutils literal notranslate"><span class="pre">torch.distributed</span></code> applications</a><ul>
<li><a class="reference internal" href="#python-breakpoint">Python Breakpoint</a></li>
<li><a class="reference internal" href="#monitored-barrier">Monitored Barrier</a></li>
<li><a class="reference internal" href="#torch-distributed-debug"><code class="docutils literal notranslate"><span class="pre">TORCH_DISTRIBUTED_DEBUG</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#logging">Logging</a><ul>
<li><a class="reference internal" href="#torch.distributed.DistError"><code class="docutils literal notranslate"><span class="pre">DistError</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.DistBackendError"><code class="docutils literal notranslate"><span class="pre">DistBackendError</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.DistNetworkError"><code class="docutils literal notranslate"><span class="pre">DistNetworkError</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.DistStoreError"><code class="docutils literal notranslate"><span class="pre">DistStoreError</span></code></a></li>
<li><a class="reference internal" href="#torch.distributed.breakpoint"><code class="docutils literal notranslate"><span class="pre">breakpoint()</span></code></a></li>
</ul>
</li>
</ul>
</li>
</ul>

            </div>
          </div>
        </div>
      </section>
    </div>

  
       <!-- Sphinx runtime scripts. documentation_options.js is included exactly once so that id="documentation_options" stays unique in the document. -->
         <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
         <script src="_static/jquery.js"></script>
         <script src="_static/underscore.js"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js"></script>
         <script src="_static/doctools.js"></script>
         <script src="_static/sphinx_highlight.js"></script>
         <script src="_static/clipboard.min.js"></script>
         <script src="_static/copybutton.js"></script>
     

  <!-- Vendor bundles (Popper, Bootstrap, List.js) are included first; theme.js is included after them. -->
  <script type="text/javascript" src="_static/js/vendor/popper.min.js"></script>
  <script type="text/javascript" src="_static/js/vendor/bootstrap.min.js"></script>
  <!-- NOTE(review): this CDN-hosted script has no integrity/crossorigin attributes; consider adding SRI. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/1.5.0/list.min.js"></script>
  <script type="text/javascript" src="_static/js/theme.js"></script>

  <script>
      // Once the DOM is ready, call the theme's SphinxRtdTheme.Navigation.enable(true).
      // NOTE(review): the meaning of the `true` argument is defined in the theme's JS, not here — confirm there.
      jQuery(document).ready(function enableThemeNavigation() {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>
 
<script>
  // Global list of navigation section titles. It is declared with a top-level `var`,
  // so it is visible to other scripts on the page.
  // NOTE(review): presumably read by the theme's side-menu code to decide which
  // sections start collapsed — confirm against theme.js.
  var collapsedSections = ['Developer Notes', 'Language Bindings', 'Libraries', 'Community'];
</script>

<img height="1" width="1" style="border-style:none;" alt="" src="https://www.googleadservices.com/pagead/conversion/795629140/?label=txkmCPmdtosBENSssfsC&amp;guid=ON&amp;script=0"/>


  <!-- Begin Footer -->

  <div class="container-fluid docs-tutorials-resources" id="docs-tutorials-resources">
    <div class="container">
      <div class="row">
        <div class="col-md-4 text-center">
          <h2>Docs</h2>
          <p>Access comprehensive developer documentation for PyTorch</p>
          <a class="with-right-arrow" href="https://pytorch.org/docs/stable/index.html">View Docs</a>
        </div>

        <div class="col-md-4 text-center">
          <h2>Tutorials</h2>
          <p>Get in-depth tutorials for beginners and advanced developers</p>
          <a class="with-right-arrow" href="https://pytorch.org/tutorials">View Tutorials</a>
        </div>

        <div class="col-md-4 text-center">
          <h2>Resources</h2>
          <p>Find development resources and get your questions answered</p>
          <a class="with-right-arrow" href="https://pytorch.org/resources">View Resources</a>
        </div>
      </div>
    </div>
  </div>

  <footer class="site-footer">
    <div class="container footer-container">
      <div class="footer-logo-wrapper">
        <!-- The link has no text content, so aria-label supplies its accessible name. -->
        <a href="https://pytorch.org/" class="footer-logo" aria-label="PyTorch"></a>
      </div>

      <div class="footer-links-wrapper">
        <div class="footer-links-col">
          <ul>
            <li class="list-title"><a href="https://pytorch.org/">PyTorch</a></li>
            <li><a href="https://pytorch.org/get-started">Get Started</a></li>
            <li><a href="https://pytorch.org/features">Features</a></li>
            <li><a href="https://pytorch.org/ecosystem">Ecosystem</a></li>
            <li><a href="https://pytorch.org/blog/">Blog</a></li>
            <li><a href="https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md">Contributing</a></li>
          </ul>
        </div>

        <div class="footer-links-col">
          <ul>
            <li class="list-title"><a href="https://pytorch.org/resources">Resources</a></li>
            <li><a href="https://pytorch.org/tutorials">Tutorials</a></li>
            <li><a href="https://pytorch.org/docs/stable/index.html">Docs</a></li>
            <li><a href="https://discuss.pytorch.org" target="_blank">Discuss</a></li>
            <li><a href="https://github.com/pytorch/pytorch/issues" target="_blank">Github Issues</a></li>
            <li><a href="https://pytorch.org/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf" target="_blank">Brand Guidelines</a></li>
          </ul>
        </div>

        <div class="footer-links-col">
          <ul>
            <li class="list-title">Stay up to date</li>
            <li><a href="https://www.facebook.com/pytorch" target="_blank">Facebook</a></li>
            <li><a href="https://twitter.com/pytorch" target="_blank">Twitter</a></li>
            <li><a href="https://www.youtube.com/pytorch" target="_blank">YouTube</a></li>
            <li><a href="https://www.linkedin.com/company/pytorch" target="_blank">LinkedIn</a></li>
          </ul>  
          </div>

        <div class="footer-links-col">
          <ul>
            <li class="list-title">PyTorch Podcasts</li>
            <li><a href="https://open.spotify.com/show/6UzHKeiy368jKfQMKKvJY5" target="_blank">Spotify</a></li>
            <li><a href="https://podcasts.apple.com/us/podcast/pytorch-developer-podcast/id1566080008" target="_blank">Apple</a></li>
            <li><a href="https://www.google.com/podcasts?feed=aHR0cHM6Ly9mZWVkcy5zaW1wbGVjYXN0LmNvbS9PQjVGa0lsOA%3D%3D" target="_blank">Google</a></li>
            <li><a href="https://music.amazon.com/podcasts/7a4e6f0e-26c2-49e9-a478-41bd244197d0/PyTorch-Developer-Podcast?" target="_blank">Amazon</a></li>
          </ul>
         </div>
        </div>
        
        <!-- Legal links; both open in a new tab, so rel="noopener" is set explicitly. -->
        <div class="privacy-policy">
          <ul>
            <li class="privacy-policy-links"><a href="https://www.linuxfoundation.org/terms/" target="_blank" rel="noopener">Terms</a></li>
            <li class="privacy-policy-links">|</li>
            <li class="privacy-policy-links"><a href="https://www.linuxfoundation.org/privacy-policy/" target="_blank" rel="noopener">Privacy</a></li>
          </ul>
        </div>
        <div class="copyright">
        <p>© Copyright The Linux Foundation. The PyTorch Foundation is a project of The Linux Foundation.
          For web site terms of use, trademark policy and other policies applicable to The PyTorch Foundation please see
          <a href="https://www.linuxfoundation.org/policies/">www.linuxfoundation.org/policies/</a>. The PyTorch Foundation supports the PyTorch open source
          project, which has been established as PyTorch Project a Series of LF Projects, LLC. For policies applicable to the PyTorch Project a Series of LF Projects, LLC,
          please see <a href="https://www.lfprojects.org/policies/">www.lfprojects.org/policies/</a>.</p>
      </div>
     </div>

  </footer>

  <!-- Cookie consent banner. The close icon is the only dismiss control here, so it carries a text alternative. -->
  <div class="cookie-banner-wrapper">
  <div class="container">
    <p class="gdpr-notice">To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: <a href="https://www.facebook.com/policies/cookies/">Cookies Policy</a>.</p>
    <img class="close-button" src="_static/images/pytorch-x.svg" alt="Close">
  </div>
</div>

  <!-- End Footer -->

  <!-- Begin Mobile Menu -->

  <div class="mobile-main-menu">
    <div class="container-fluid">
      <div class="container">
        <div class="mobile-main-menu-header-container">
          <a class="header-logo" href="https://pytorch.org/" aria-label="PyTorch"></a>
          <!-- Empty link tagged data-behavior="close-mobile-menu"; aria-label supplies its accessible name. -->
          <a class="main-menu-close-button" href="#" data-behavior="close-mobile-menu" aria-label="Close menu"></a>
        </div>
      </div>
    </div>

    <div class="mobile-main-menu-links-container">
      <div class="main-menu">
        <ul>
           <li class="resources-mobile-menu-title">
             <a>Learn</a>
           </li>
           <ul class="resources-mobile-menu-items">
             <li>
               <a href="https://pytorch.org/get-started">Get Started</a>
             </li>
             <li>
               <a href="https://pytorch.org/tutorials">Tutorials</a>
             </li>
             <li>
               <a href="https://pytorch.org/tutorials/beginner/basics/intro.html">Learn the Basics</a>
             </li>
             <li>
               <a href="https://pytorch.org/tutorials/recipes/recipes_index.html">PyTorch Recipes</a>
             </li>
             <li>
               <a href="https://pytorch.org/tutorials/beginner/introyt.html">Introduction to PyTorch - YouTube Series</a>
             </li>
           </ul>
           <li class="resources-mobile-menu-title">
             <a>Ecosystem</a>
           </li>
           <ul class="resources-mobile-menu-items">
             <li>
               <a href="https://pytorch.org/ecosystem">Tools</a>
             </li>
             <li>
               <a href="https://pytorch.org/#community-module">Community</a>
             </li>
             <li>
               <a href="https://discuss.pytorch.org/">Forums</a>
             </li>
             <li>
               <a href="https://pytorch.org/resources">Developer Resources</a>
             </li>
             <li>
               <a href="https://pytorch.org/ecosystem/contributor-awards-2023">Contributor Awards - 2023</a>
             </li>
           </ul>

           <li class="resources-mobile-menu-title">
             <a>Edge</a>
           </li>

           <ul class="resources-mobile-menu-items">
             <li>
               <a href="https://pytorch.org/edge">About PyTorch Edge</a>
             </li>
             
             <li>
               <a href="https://pytorch.org/executorch-overview">ExecuTorch</a>
             </li>
           </ul>

           <li class="resources-mobile-menu-title">
             <a>Docs</a>
           </li>

           <ul class="resources-mobile-menu-items">
            <li>
              <a href="https://pytorch.org/docs/stable/index.html">PyTorch</a>
            </li>

            <li>
              <a href="https://pytorch.org/pytorch-domains">PyTorch Domains</a>
            </li>
          </ul>

          <li class="resources-mobile-menu-title">
            <a>Blog &amp; News</a>
          </li>
            
           <ul class="resources-mobile-menu-items">
            <li>
              <a href="https://pytorch.org/blog/">PyTorch Blog</a>
            </li>
            <li>
              <a href="https://pytorch.org/community-blog">Community Blog</a>
            </li>

            <li>
              <a href="https://pytorch.org/videos">Videos</a>
            </li>

            <li>
              <a href="https://pytorch.org/community-stories">Community Stories</a>
            </li>
            <li>
              <a href="https://pytorch.org/events">Events</a>
            </li>
          </ul>
          
          <li class="resources-mobile-menu-title">
            <a>About</a>
          </li>

          <ul class="resources-mobile-menu-items">
            <li>
              <a href="https://pytorch.org/foundation">PyTorch Foundation</a>
            </li>
            <li>
              <a href="https://pytorch.org/governing-board">Governing Board</a>
            </li>
          </ul>
        </ul>
      </div>
    </div>
  </div>

  <!-- End Mobile Menu -->

  <script type="text/javascript" src="_static/js/vendor/anchor.min.js"></script>

  <script>
    // When the DOM is ready, call bind() on each theme component, in this order.
    $(function () {
      mobileMenu.bind();
      mobileTOC.bind();
      pytorchAnchors.bind();
      sideMenus.bind();
      scrollToAnchor.bind();
      highlightNavigation.bind();
      mainMenuDropdown.bind();
      filterTags.bind();

      // Links cannot be created inside code blocks, so every article link that
      // wraps an inline code span (span.pre) is tagged with the "has-code" class.
      $("article.pytorch-article a span.pre").closest("a").addClass("has-code");
    });
  </script>
</body>
</html>