<!DOCTYPE html> <!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]--> <!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]--> <head> <meta name="robots" content="noindex"> <meta charset="utf-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <title>CUDA semantics — PyTorch 1.7.0 documentation</title> <link rel="canonical" href="https://pytorch.org/docs/stable/notes/cuda.html"/> <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" /> <!-- <link rel="stylesheet" href="../_static/pygments.css" type="text/css" /> --> <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css" type="text/css" /> <link rel="stylesheet" href="../_static/css/jit.css" type="text/css" /> <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.11.1/dist/katex.min.css" type="text/css" /> <link rel="stylesheet" href="../_static/katex-math.css" type="text/css" /> <link rel="index" title="Index" href="../genindex.html" /> <link rel="search" title="Search" href="../search.html" /> <link rel="next" title="Distributed Data Parallel" href="ddp.html" /> <link rel="prev" title="CPU threading and TorchScript inference" href="cpu_threading_torchscript_inference.html" /> <script src="../_static/js/modernizr.min.js"></script> <!-- Preload the theme fonts --> <link rel="preload" href="../_static/fonts/FreightSans/freight-sans-book.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="../_static/fonts/FreightSans/freight-sans-medium.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="../_static/fonts/IBMPlexMono/IBMPlexMono-Medium.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="../_static/fonts/FreightSans/freight-sans-bold.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="../_static/fonts/FreightSans/freight-sans-medium-italic.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="../_static/fonts/IBMPlexMono/IBMPlexMono-SemiBold.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <!-- Preload the katex fonts --> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Math-Italic.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Main-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Main-Bold.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size1-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size4-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size2-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Size3-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous"> <link rel="preload" href="https://cdn.jsdelivr.net/npm/katex@0.10.0/dist/fonts/KaTeX_Caligraphic-Regular.woff2" as="font" type="font/woff2" crossorigin="anonymous"> </head> <div class="container-fluid header-holder tutorials-header" id="header-holder"> <div class="container"> <div class="header-container"> <a class="header-logo" href="https://pytorch.org/" aria-label="PyTorch"></a> <div class="main-menu"> <ul> <li> <a href="https://pytorch.org/get-started">Get Started</a> </li> <li> <div class="ecosystem-dropdown"> <a id="dropdownMenuButton" data-toggle="ecosystem-dropdown"> Ecosystem </a> <div class="ecosystem-dropdown-menu"> <a class="nav-dropdown-item" href="https://pytorch.org/hub""> <span class=dropdown-title>Models (Beta)</span> <p>Discover, publish, and reuse pre-trained models</p> </a> <a class="nav-dropdown-item" href="https://pytorch.org/ecosystem"> <span class=dropdown-title>Tools & Libraries</span> <p>Explore the ecosystem of tools and libraries</p> </a> </div> </div> </li> <li> <a href="https://pytorch.org/mobile">Mobile</a> </li> <li> <a href="https://pytorch.org/blog/">Blog</a> </li> <li> <a href="https://pytorch.org/tutorials">Tutorials</a> </li> <li class="active"> <a href="https://pytorch.org/docs/stable/index.html">Docs</a> </li> <li> <div class="resources-dropdown"> <a id="resourcesDropdownButton" data-toggle="resources-dropdown"> Resources </a> <div class="resources-dropdown-menu"> <a class="nav-dropdown-item" href="https://pytorch.org/resources""> <span class=dropdown-title>Developer Resources</span> <p>Find resources and get questions answered</p> </a> <a class="nav-dropdown-item" href="https://pytorch.org/features"> <span class=dropdown-title>About</span> <p>Learn about PyTorch’s features and capabilities</p> </a> </div> </div> </li> <li> <a href="https://github.com/pytorch/pytorch">Github</a> </li> </ul> </div> <a class="main-menu-open-button" href="#" data-behavior="open-mobile-menu"></a> </div> </div> </div> <body class="pytorch-body"> <div class="table-of-contents-link-wrapper"> <span>Table of Contents</span> <a href="#" class="toggle-table-of-contents" data-behavior="toggle-table-of-contents"></a> </div> <nav data-toggle="wy-nav-shift" class="pytorch-left-menu" id="pytorch-left-menu"> <div class="pytorch-side-scroll"> <div class="pytorch-menu pytorch-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation"> <div class="pytorch-left-menu-search"> <div class="version"> <a href='http://pytorch.org/docs/versions.html'>1.7.0 ▼</a> </div> <div role="search"> <form id="rtd-search-form" class="wy-form" action="../search.html" method="get"> <input type="text" name="q" placeholder="Search Docs" /> <input type="hidden" name="check_keywords" value="yes" /> <input type="hidden" name="area" value="default" /> </form> </div> </div> <p class="caption"><span class="caption-text">Notes</span></p> <ul class="current"> <li class="toctree-l1"><a class="reference internal" href="amp_examples.html">Automatic Mixed Precision examples</a></li> <li class="toctree-l1"><a class="reference internal" href="autograd.html">Autograd mechanics</a></li> <li class="toctree-l1"><a class="reference internal" href="broadcasting.html">Broadcasting semantics</a></li> <li class="toctree-l1"><a class="reference internal" href="cpu_threading_torchscript_inference.html">CPU threading and TorchScript inference</a></li> <li class="toctree-l1 current"><a class="current reference internal" href="#">CUDA semantics</a></li> <li class="toctree-l1"><a class="reference internal" href="ddp.html">Distributed Data Parallel</a></li> <li class="toctree-l1"><a class="reference internal" href="extending.html">Extending PyTorch</a></li> <li class="toctree-l1"><a class="reference internal" href="faq.html">Frequently Asked Questions</a></li> <li class="toctree-l1"><a class="reference internal" href="large_scale_deployments.html">Features for large-scale deployments</a></li> <li class="toctree-l1"><a class="reference internal" href="multiprocessing.html">Multiprocessing best practices</a></li> <li class="toctree-l1"><a class="reference internal" href="randomness.html">Reproducibility</a></li> <li class="toctree-l1"><a class="reference internal" href="serialization.html">Serialization semantics</a></li> <li class="toctree-l1"><a class="reference internal" href="windows.html">Windows FAQ</a></li> </ul> <p class="caption"><span class="caption-text">Language Bindings</span></p> <ul> <li class="toctree-l1"><a class="reference internal" href="../cpp_index.html">C++</a></li> <li class="toctree-l1"><a class="reference external" href="https://pytorch.org/javadoc/">Javadoc</a></li> </ul> <p class="caption"><span class="caption-text">Python API</span></p> <ul> <li class="toctree-l1"><a class="reference internal" href="../torch.html">torch</a></li> <li class="toctree-l1"><a class="reference internal" href="../nn.html">torch.nn</a></li> <li class="toctree-l1"><a class="reference internal" href="../nn.functional.html">torch.nn.functional</a></li> <li class="toctree-l1"><a class="reference internal" href="../tensors.html">torch.Tensor</a></li> <li class="toctree-l1"><a class="reference internal" href="../tensor_attributes.html">Tensor Attributes</a></li> <li class="toctree-l1"><a class="reference internal" href="../tensor_view.html">Tensor Views</a></li> <li class="toctree-l1"><a class="reference internal" href="../autograd.html">torch.autograd</a></li> <li class="toctree-l1"><a class="reference internal" href="../cuda.html">torch.cuda</a></li> <li class="toctree-l1"><a class="reference internal" href="../amp.html">torch.cuda.amp</a></li> <li class="toctree-l1"><a class="reference internal" href="../backends.html">torch.backends</a></li> <li class="toctree-l1"><a class="reference internal" href="../distributed.html">torch.distributed</a></li> <li class="toctree-l1"><a class="reference internal" href="../distributions.html">torch.distributions</a></li> <li class="toctree-l1"><a class="reference internal" href="../fft.html">torch.fft</a></li> <li class="toctree-l1"><a class="reference internal" href="../futures.html">torch.futures</a></li> <li class="toctree-l1"><a class="reference internal" href="../hub.html">torch.hub</a></li> <li class="toctree-l1"><a class="reference internal" href="../jit.html">torch.jit</a></li> <li class="toctree-l1"><a class="reference internal" href="../linalg.html">torch.linalg</a></li> <li class="toctree-l1"><a class="reference internal" href="../nn.init.html">torch.nn.init</a></li> <li class="toctree-l1"><a class="reference internal" href="../onnx.html">torch.onnx</a></li> <li class="toctree-l1"><a class="reference internal" href="../optim.html">torch.optim</a></li> <li class="toctree-l1"><a class="reference internal" href="../complex_numbers.html">Complex Numbers</a></li> <li class="toctree-l1"><a class="reference internal" href="../quantization.html">Quantization</a></li> <li class="toctree-l1"><a class="reference internal" href="../rpc.html">Distributed RPC Framework</a></li> <li class="toctree-l1"><a class="reference internal" href="../random.html">torch.random</a></li> <li class="toctree-l1"><a class="reference internal" href="../sparse.html">torch.sparse</a></li> <li class="toctree-l1"><a class="reference internal" href="../storage.html">torch.Storage</a></li> <li class="toctree-l1"><a class="reference internal" href="../bottleneck.html">torch.utils.bottleneck</a></li> <li class="toctree-l1"><a class="reference internal" href="../checkpoint.html">torch.utils.checkpoint</a></li> <li class="toctree-l1"><a class="reference internal" href="../cpp_extension.html">torch.utils.cpp_extension</a></li> <li class="toctree-l1"><a class="reference internal" href="../data.html">torch.utils.data</a></li> <li class="toctree-l1"><a class="reference internal" href="../dlpack.html">torch.utils.dlpack</a></li> <li class="toctree-l1"><a class="reference internal" href="../mobile_optimizer.html">torch.utils.mobile_optimizer</a></li> <li class="toctree-l1"><a class="reference internal" href="../model_zoo.html">torch.utils.model_zoo</a></li> <li class="toctree-l1"><a class="reference internal" href="../tensorboard.html">torch.utils.tensorboard</a></li> <li class="toctree-l1"><a class="reference internal" href="../type_info.html">Type Info</a></li> <li class="toctree-l1"><a class="reference internal" href="../named_tensor.html">Named Tensors</a></li> <li class="toctree-l1"><a class="reference internal" href="../name_inference.html">Named Tensors operator coverage</a></li> <li class="toctree-l1"><a class="reference internal" href="../__config__.html">torch.__config__</a></li> </ul> <p class="caption"><span class="caption-text">Libraries</span></p> <ul> <li class="toctree-l1"><a class="reference external" href="https://pytorch.org/audio">torchaudio</a></li> <li class="toctree-l1"><a class="reference external" href="https://pytorch.org/text">torchtext</a></li> <li class="toctree-l1"><a class="reference external" href="https://pytorch.org/vision">torchvision</a></li> <li class="toctree-l1"><a class="reference external" href="https://pytorch.org/elastic/">TorchElastic</a></li> <li class="toctree-l1"><a class="reference external" href="https://pytorch.org/serve">TorchServe</a></li> <li class="toctree-l1"><a class="reference external" href="http://pytorch.org/xla/">PyTorch on XLA Devices</a></li> </ul> <p class="caption"><span class="caption-text">Community</span></p> <ul> <li class="toctree-l1"><a class="reference internal" href="../community/contribution_guide.html">PyTorch Contribution Guide</a></li> <li class="toctree-l1"><a class="reference internal" href="../community/governance.html">PyTorch Governance</a></li> <li class="toctree-l1"><a class="reference internal" href="../community/persons_of_interest.html">PyTorch Governance | Persons of Interest</a></li> </ul> </div> </div> </nav> <div class="pytorch-container"> <div class="pytorch-page-level-bar" id="pytorch-page-level-bar"> <div class="pytorch-breadcrumbs-wrapper"> <div role="navigation" aria-label="breadcrumbs navigation"> <ul class="pytorch-breadcrumbs"> <li> <a href="../index.html"> Docs </a> > </li> <li>CUDA semantics</li> <li class="pytorch-breadcrumbs-aside"> <a href="../_sources/notes/cuda.rst.txt" rel="nofollow"><img src="../_static/images/view-page-source-icon.svg"></a> </li> </ul> </div> </div> <div class="pytorch-shortcuts-wrapper" id="pytorch-shortcuts-wrapper"> Shortcuts </div> </div> <section data-toggle="wy-nav-shift" id="pytorch-content-wrap" class="pytorch-content-wrap"> <div class="pytorch-content-left"> <div class="rst-content"> <div role="main" class="main-content" itemscope="itemscope" itemtype="http://schema.org/Article"> <article itemprop="articleBody" id="pytorch-article" class="pytorch-article"> <div class="section" id="cuda-semantics"> <span id="id1"></span><h1>CUDA semantics<a class="headerlink" href="#cuda-semantics" title="Permalink to this headline">¶</a></h1> <p><a class="reference internal" href="../cuda.html#module-torch.cuda" title="torch.cuda"><code class="xref py py-mod docutils literal notranslate"><span class="pre">torch.cuda</span></code></a> is used to set up and run CUDA operations. It keeps track of the currently selected GPU, and all CUDA tensors you allocate will by default be created on that device. The selected device can be changed with a <a class="reference internal" href="../cuda.html#torch.cuda.device" title="torch.cuda.device"><code class="xref any py py-class docutils literal notranslate"><span class="pre">torch.cuda.device</span></code></a> context manager.</p> <p>However, once a tensor is allocated, you can do operations on it irrespective of the selected device, and the results will be always placed in on the same device as the tensor.</p> <p>Cross-GPU operations are not allowed by default, with the exception of <a class="reference internal" href="../tensors.html#torch.Tensor.copy_" title="torch.Tensor.copy_"><code class="xref py py-meth docutils literal notranslate"><span class="pre">copy_()</span></code></a> and other methods with copy-like functionality such as <a class="reference internal" href="../tensors.html#torch.Tensor.to" title="torch.Tensor.to"><code class="xref py py-meth docutils literal notranslate"><span class="pre">to()</span></code></a> and <a class="reference internal" href="../tensors.html#torch.Tensor.cuda" title="torch.Tensor.cuda"><code class="xref py py-meth docutils literal notranslate"><span class="pre">cuda()</span></code></a>. Unless you enable peer-to-peer memory access, any attempts to launch ops on tensors spread across different devices will raise an error.</p> <p>Below you can find a small example showcasing this:</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">cuda</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda'</span><span class="p">)</span> <span class="c1"># Default CUDA device</span> <span class="n">cuda0</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda:0'</span><span class="p">)</span> <span class="n">cuda2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda:2'</span><span class="p">)</span> <span class="c1"># GPU 2 (these are 0-indexed)</span> <span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">],</span> <span class="n">device</span><span class="o">=</span><span class="n">cuda0</span><span class="p">)</span> <span class="c1"># x.device is device(type='cuda', index=0)</span> <span class="n">y</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">])</span><span class="o">.</span><span class="n">cuda</span><span class="p">()</span> <span class="c1"># y.device is device(type='cuda', index=0)</span> <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="mi">1</span><span class="p">):</span> <span class="c1"># allocates a tensor on GPU 1</span> <span class="n">a</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">],</span> <span class="n">device</span><span class="o">=</span><span class="n">cuda</span><span class="p">)</span> <span class="c1"># transfers a tensor from CPU to GPU 1</span> <span class="n">b</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">])</span><span class="o">.</span><span class="n">cuda</span><span class="p">()</span> <span class="c1"># a.device and b.device are device(type='cuda', index=1)</span> <span class="c1"># You can also use ``Tensor.to`` to transfer a tensor:</span> <span class="n">b2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mf">1.</span><span class="p">,</span> <span class="mf">2.</span><span class="p">])</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="o">=</span><span class="n">cuda</span><span class="p">)</span> <span class="c1"># b.device and b2.device are device(type='cuda', index=1)</span> <span class="n">c</span> <span class="o">=</span> <span class="n">a</span> <span class="o">+</span> <span class="n">b</span> <span class="c1"># c.device is device(type='cuda', index=1)</span> <span class="n">z</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="n">y</span> <span class="c1"># z.device is device(type='cuda', index=0)</span> <span class="c1"># even within a context, you can specify the device</span> <span class="c1"># (or give a GPU index to the .cuda call)</span> <span class="n">d</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">cuda2</span><span class="p">)</span> <span class="n">e</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">cuda2</span><span class="p">)</span> <span class="n">f</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">cuda</span><span class="p">(</span><span class="n">cuda2</span><span class="p">)</span> <span class="c1"># d.device, e.device, and f.device are all device(type='cuda', index=2)</span> </pre></div> </div> <div class="section" id="tensorfloat-32-tf32-on-ampere-devices"> <span id="tf32-on-ampere"></span><h2>TensorFloat-32(TF32) on Ampere devices<a class="headerlink" href="#tensorfloat-32-tf32-on-ampere-devices" title="Permalink to this headline">¶</a></h2> <p>Starting in PyTorch 1.7, there is a new flag called <cite>allow_tf32</cite> which defaults to true. This flag controls whether PyTorch is allowed to use the TensorFloat32 (TF32) tensor cores, available on new NVIDIA GPUs since Ampere, internally to compute matmul (matrix multiplies and batched matrix multiplies) and convolutions.</p> <p>TF32 tensor cores are designed to achieve better performance on matmul and convolutions on <cite>torch.float32</cite> tensors by truncating input data to have 10 bits of mantissa, and accumulating results with FP32 precision, maintaining FP32 dynamic range.</p> <p>matmuls and convolutions are controlled separately, and their corresponding flags can be accessed at:</p> <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># The flag below controls whether to allow TF32 on matmul. This flag defaults to True.</span> <span class="n">torch</span><span class="o">.</span><span class="n">backends</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">matmul</span><span class="o">.</span><span class="n">allow_tf32</span> <span class="o">=</span> <span class="kc">True</span> <span class="c1"># The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.</span> <span class="n">torch</span><span class="o">.</span><span class="n">backends</span><span class="o">.</span><span class="n">cudnn</span><span class="o">.</span><span class="n">allow_tf32</span> <span class="o">=</span> <span class="kc">True</span> </pre></div> </div> <p>Note that besides matmuls and convolutions themselves, functions and nn modules that internally uses matmuls or convolutions are also affected. These include <cite>nn.Linear</cite>, <cite>nn.Conv*</cite>, cdist, tensordot, affine grid and grid sample, adaptive log softmax, GRU and LSTM.</p> <p>To get an idea of the precision and speed, see the example code below:</p> <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">a_full</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10240</span><span class="p">,</span> <span class="mi">10240</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">double</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">'cuda'</span><span class="p">)</span> <span class="n">b_full</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">10240</span><span class="p">,</span> <span class="mi">10240</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">double</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">'cuda'</span><span class="p">)</span> <span class="n">ab_full</span> <span class="o">=</span> <span class="n">a_full</span> <span class="o">@</span> <span class="n">b_full</span> <span class="n">mean</span> <span class="o">=</span> <span class="n">ab_full</span><span class="o">.</span><span class="n">abs</span><span class="p">()</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> <span class="c1"># 80.7277</span> <span class="n">a</span> <span class="o">=</span> <span class="n">a_full</span><span class="o">.</span><span class="n">float</span><span class="p">()</span> <span class="n">b</span> <span class="o">=</span> <span class="n">b_full</span><span class="o">.</span><span class="n">float</span><span class="p">()</span> <span class="c1"># Do matmul at TF32 mode.</span> <span class="n">ab_tf32</span> <span class="o">=</span> <span class="n">a</span> <span class="o">@</span> <span class="n">b</span> <span class="c1"># takes 0.016s on GA100</span> <span class="n">error</span> <span class="o">=</span> <span class="p">(</span><span class="n">ab_tf32</span> <span class="o">-</span> <span class="n">ab_full</span><span class="p">)</span><span class="o">.</span><span class="n">abs</span><span class="p">()</span><span class="o">.</span><span class="n">max</span><span class="p">()</span> <span class="c1"># 0.1747</span> <span class="n">relative_error</span> <span class="o">=</span> <span class="n">error</span> <span class="o">/</span> <span class="n">mean</span> <span class="c1"># 0.0022</span> <span class="c1"># Do matmul with TF32 disabled.</span> <span class="n">torch</span><span class="o">.</span><span class="n">backends</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">matmul</span><span class="o">.</span><span class="n">allow_tf32</span> <span class="o">=</span> <span class="kc">False</span> <span class="n">ab_fp32</span> <span class="o">=</span> <span class="n">a</span> <span class="o">@</span> <span class="n">b</span> <span class="c1"># takes 0.11s on GA100</span> <span class="n">error</span> <span class="o">=</span> <span class="p">(</span><span class="n">ab_fp32</span> <span class="o">-</span> <span class="n">ab_full</span><span class="p">)</span><span class="o">.</span><span class="n">abs</span><span class="p">()</span><span class="o">.</span><span class="n">max</span><span class="p">()</span> <span class="c1"># 0.0031</span> <span class="n">relative_error</span> <span class="o">=</span> <span class="n">error</span> <span class="o">/</span> <span class="n">mean</span> <span class="c1"># 0.000039</span> </pre></div> </div> <p>From the above example, we can see that with TF32 enabled, the speed is ~7x faster, relative error compared to double precision is approximately 2 orders of magnitude larger. If the full FP32 precision is needed, users can disable TF32 by:</p> <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">torch</span><span class="o">.</span><span class="n">backends</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">matmul</span><span class="o">.</span><span class="n">allow_tf32</span> <span class="o">=</span> <span class="kc">False</span> <span class="n">torch</span><span class="o">.</span><span class="n">backends</span><span class="o">.</span><span class="n">cudnn</span><span class="o">.</span><span class="n">allow_tf32</span> <span class="o">=</span> <span class="kc">False</span> </pre></div> </div> <p>For more information about TF32, see:</p> <ul class="simple"> <li><p><a class="reference external" href="https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/">TensorFloat-32</a></p></li> <li><p><a class="reference external" href="https://devblogs.nvidia.com/cuda-11-features-revealed/">CUDA 11</a></p></li> <li><p><a class="reference external" href="https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/">Ampere architecture</a></p></li> </ul> </div> <div class="section" id="asynchronous-execution"> <h2>Asynchronous execution<a class="headerlink" href="#asynchronous-execution" title="Permalink to this headline">¶</a></h2> <p>By default, GPU operations are asynchronous. When you call a function that uses the GPU, the operations are <em>enqueued</em> to the particular device, but not necessarily executed until later. This allows us to execute more computations in parallel, including operations on CPU or other GPUs.</p> <p>In general, the effect of asynchronous computation is invisible to the caller, because (1) each device executes operations in the order they are queued, and (2) PyTorch automatically performs necessary synchronization when copying data between CPU and GPU or between two GPUs. Hence, computation will proceed as if every operation was executed synchronously.</p> <p>You can force synchronous computation by setting environment variable <code class="docutils literal notranslate"><span class="pre">CUDA_LAUNCH_BLOCKING=1</span></code>. This can be handy when an error occurs on the GPU. (With asynchronous execution, such an error isn’t reported until after the operation is actually executed, so the stack trace does not show where it was requested.)</p> <p>A consequence of the asynchronous computation is that time measurements without synchronizations are not accurate. To get precise measurements, one should either call <a class="reference internal" href="../cuda.html#torch.cuda.synchronize" title="torch.cuda.synchronize"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.cuda.synchronize()</span></code></a> before measuring, or use <a class="reference internal" href="../cuda.html#torch.cuda.Event" title="torch.cuda.Event"><code class="xref py py-class docutils literal notranslate"><span class="pre">torch.cuda.Event</span></code></a> to record times as following:</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">start_event</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">Event</span><span class="p">(</span><span class="n">enable_timing</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="n">end_event</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">Event</span><span class="p">(</span><span class="n">enable_timing</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="n">start_event</span><span class="o">.</span><span class="n">record</span><span class="p">()</span> <span class="c1"># Run some things here</span> <span class="n">end_event</span><span class="o">.</span><span class="n">record</span><span class="p">()</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">synchronize</span><span class="p">()</span> <span class="c1"># Wait for the events to be recorded!</span> <span class="n">elapsed_time_ms</span> <span class="o">=</span> <span class="n">start_event</span><span class="o">.</span><span class="n">elapsed_time</span><span class="p">(</span><span class="n">end_event</span><span class="p">)</span> </pre></div> </div> <p>As an exception, several functions such as <a class="reference internal" href="../tensors.html#torch.Tensor.to" title="torch.Tensor.to"><code class="xref py py-meth docutils literal notranslate"><span class="pre">to()</span></code></a> and <a class="reference internal" href="../tensors.html#torch.Tensor.copy_" title="torch.Tensor.copy_"><code class="xref py py-meth docutils literal notranslate"><span class="pre">copy_()</span></code></a> admit an explicit <code class="xref py py-attr docutils literal notranslate"><span class="pre">non_blocking</span></code> argument, which lets the caller bypass synchronization when it is unnecessary. Another exception is CUDA streams, explained below.</p> <div class="section" id="cuda-streams"> <h3>CUDA streams<a class="headerlink" href="#cuda-streams" title="Permalink to this headline">¶</a></h3> <p>A <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams">CUDA stream</a> is a linear sequence of execution that belongs to a specific device. You normally do not need to create one explicitly: by default, each device uses its own “default” stream.</p> <p>Operations inside each stream are serialized in the order they are created, but operations from different streams can execute concurrently in any relative order, unless explicit synchronization functions (such as <a class="reference internal" href="../cuda.html#torch.cuda.synchronize" title="torch.cuda.synchronize"><code class="xref py py-meth docutils literal notranslate"><span class="pre">synchronize()</span></code></a> or <a class="reference internal" href="../cuda.html#torch.cuda.Stream.wait_stream" title="torch.cuda.Stream.wait_stream"><code class="xref py py-meth docutils literal notranslate"><span class="pre">wait_stream()</span></code></a>) are used. For example, the following code is incorrect:</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">cuda</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda'</span><span class="p">)</span> <span class="n">s</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">Stream</span><span class="p">()</span> <span class="c1"># Create a new stream.</span> <span class="n">A</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="mi">100</span><span class="p">,</span> <span class="mi">100</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="n">cuda</span><span class="p">)</span><span class="o">.</span><span class="n">normal_</span><span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">)</span> <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">stream</span><span class="p">(</span><span class="n">s</span><span class="p">):</span> <span class="c1"># sum() may start execution before normal_() finishes!</span> <span class="n">B</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">A</span><span class="p">)</span> </pre></div> </div> <p>When the “current stream” is the default stream, PyTorch automatically performs necessary synchronization when data is moved around, as explained above. However, when using non-default streams, it is the user’s responsibility to ensure proper synchronization.</p> </div> </div> <div class="section" id="memory-management"> <span id="cuda-memory-management"></span><h2>Memory management<a class="headerlink" href="#memory-management" title="Permalink to this headline">¶</a></h2> <p>PyTorch uses a caching memory allocator to speed up memory allocations. This allows fast memory deallocation without device synchronizations. However, the unused memory managed by the allocator will still show as if used in <code class="docutils literal notranslate"><span class="pre">nvidia-smi</span></code>. You can use <a class="reference internal" href="../cuda.html#torch.cuda.memory_allocated" title="torch.cuda.memory_allocated"><code class="xref py py-meth docutils literal notranslate"><span class="pre">memory_allocated()</span></code></a> and <a class="reference internal" href="../cuda.html#torch.cuda.max_memory_allocated" title="torch.cuda.max_memory_allocated"><code class="xref py py-meth docutils literal notranslate"><span class="pre">max_memory_allocated()</span></code></a> to monitor memory occupied by tensors, and use <a class="reference internal" href="../cuda.html#torch.cuda.memory_reserved" title="torch.cuda.memory_reserved"><code class="xref py py-meth docutils literal notranslate"><span class="pre">memory_reserved()</span></code></a> and <a class="reference internal" href="../cuda.html#torch.cuda.max_memory_reserved" title="torch.cuda.max_memory_reserved"><code class="xref py py-meth docutils literal notranslate"><span class="pre">max_memory_reserved()</span></code></a> to monitor the total amount of memory managed by the caching allocator. Calling <a class="reference internal" href="../cuda.html#torch.cuda.empty_cache" title="torch.cuda.empty_cache"><code class="xref py py-meth docutils literal notranslate"><span class="pre">empty_cache()</span></code></a> releases all <strong>unused</strong> cached memory from PyTorch so that those can be used by other GPU applications. However, the occupied GPU memory by tensors will not be freed so it can not increase the amount of GPU memory available for PyTorch.</p> <p>For more advanced users, we offer more comprehensive memory benchmarking via <a class="reference internal" href="../cuda.html#torch.cuda.memory_stats" title="torch.cuda.memory_stats"><code class="xref py py-meth docutils literal notranslate"><span class="pre">memory_stats()</span></code></a>. We also offer the capability to capture a complete snapshot of the memory allocator state via <a class="reference internal" href="../cuda.html#torch.cuda.memory_snapshot" title="torch.cuda.memory_snapshot"><code class="xref py py-meth docutils literal notranslate"><span class="pre">memory_snapshot()</span></code></a>, which can help you understand the underlying allocation patterns produced by your code.</p> <p>Use of a caching allocator can interfere with memory checking tools such as <code class="docutils literal notranslate"><span class="pre">cuda-memcheck</span></code>. To debug memory errors using <code class="docutils literal notranslate"><span class="pre">cuda-memcheck</span></code>, set <code class="docutils literal notranslate"><span class="pre">PYTORCH_NO_CUDA_MEMORY_CACHING=1</span></code> in your environment to disable caching.</p> </div> <div class="section" id="cufft-plan-cache"> <span id="id2"></span><h2>cuFFT plan cache<a class="headerlink" href="#cufft-plan-cache" title="Permalink to this headline">¶</a></h2> <p>For each CUDA device, an LRU cache of cuFFT plans is used to speed up repeatedly running FFT methods (e.g., <a class="reference internal" href="../generated/torch.fft.html#torch.fft" title="torch.fft"><code class="xref py py-func docutils literal notranslate"><span class="pre">torch.fft()</span></code></a>) on CUDA tensors of same geometry with same configuration. Because some cuFFT plans may allocate GPU memory, these caches have a maximum capacity.</p> <p>You may control and query the properties of the cache of current device with the following APIs:</p> <ul class="simple"> <li><p><code class="docutils literal notranslate"><span class="pre">torch.backends.cuda.cufft_plan_cache.max_size</span></code> gives the capacity of the cache (default is 4096 on CUDA 10 and newer, and 1023 on older CUDA versions). Setting this value directly modifies the capacity.</p></li> <li><p><code class="docutils literal notranslate"><span class="pre">torch.backends.cuda.cufft_plan_cache.size</span></code> gives the number of plans currently residing in the cache.</p></li> <li><p><code class="docutils literal notranslate"><span class="pre">torch.backends.cuda.cufft_plan_cache.clear()</span></code> clears the cache.</p></li> </ul> <p>To control and query plan caches of a non-default device, you can index the <code class="docutils literal notranslate"><span class="pre">torch.backends.cuda.cufft_plan_cache</span></code> object with either a <code class="xref py py-class docutils literal notranslate"><span class="pre">torch.device</span></code> object or a device index, and access one of the above attributes. E.g., to set the capacity of the cache for device <code class="docutils literal notranslate"><span class="pre">1</span></code>, one can write <code class="docutils literal notranslate"><span class="pre">torch.backends.cuda.cufft_plan_cache[1].max_size</span> <span class="pre">=</span> <span class="pre">10</span></code>.</p> </div> <div class="section" id="best-practices"> <h2>Best practices<a class="headerlink" href="#best-practices" title="Permalink to this headline">¶</a></h2> <div class="section" id="device-agnostic-code"> <h3>Device-agnostic code<a class="headerlink" href="#device-agnostic-code" title="Permalink to this headline">¶</a></h3> <p>Due to the structure of PyTorch, you may need to explicitly write device-agnostic (CPU or GPU) code; an example may be creating a new tensor as the initial hidden state of a recurrent neural network.</p> <p>The first step is to determine whether the GPU should be used or not. A common pattern is to use Python’s <code class="docutils literal notranslate"><span class="pre">argparse</span></code> module to read in user arguments, and have a flag that can be used to disable CUDA, in combination with <a class="reference internal" href="../cuda.html#torch.cuda.is_available" title="torch.cuda.is_available"><code class="xref py py-meth docutils literal notranslate"><span class="pre">is_available()</span></code></a>. In the following, <code class="docutils literal notranslate"><span class="pre">args.device</span></code> results in a <code class="xref py py-class docutils literal notranslate"><span class="pre">torch.device</span></code> object that can be used to move tensors to CPU or CUDA.</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">argparse</span> <span class="kn">import</span> <span class="nn">torch</span> <span class="n">parser</span> <span class="o">=</span> <span class="n">argparse</span><span class="o">.</span><span class="n">ArgumentParser</span><span class="p">(</span><span class="n">description</span><span class="o">=</span><span class="s1">'PyTorch Example'</span><span class="p">)</span> <span class="n">parser</span><span class="o">.</span><span class="n">add_argument</span><span class="p">(</span><span class="s1">'--disable-cuda'</span><span class="p">,</span> <span class="n">action</span><span class="o">=</span><span class="s1">'store_true'</span><span class="p">,</span> <span class="n">help</span><span class="o">=</span><span class="s1">'Disable CUDA'</span><span class="p">)</span> <span class="n">args</span> <span class="o">=</span> <span class="n">parser</span><span class="o">.</span><span class="n">parse_args</span><span class="p">()</span> <span class="n">args</span><span class="o">.</span><span class="n">device</span> <span class="o">=</span> <span class="kc">None</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">args</span><span class="o">.</span><span class="n">disable_cuda</span> <span class="ow">and</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">is_available</span><span class="p">():</span> <span class="n">args</span><span class="o">.</span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda'</span><span class="p">)</span> <span class="k">else</span><span class="p">:</span> <span class="n">args</span><span class="o">.</span><span class="n">device</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cpu'</span><span class="p">)</span> </pre></div> </div> <p>Now that we have <code class="docutils literal notranslate"><span class="pre">args.device</span></code>, we can use it to create a Tensor on the desired device.</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="mi">8</span><span class="p">,</span> <span class="mi">42</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="n">args</span><span class="o">.</span><span class="n">device</span><span class="p">)</span> <span class="n">net</span> <span class="o">=</span> <span class="n">Network</span><span class="p">()</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="o">=</span><span class="n">args</span><span class="o">.</span><span class="n">device</span><span class="p">)</span> </pre></div> </div> <p>This can be used in a number of cases to produce device agnostic code. Below is an example when using a dataloader:</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">cuda0</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda:0'</span><span class="p">)</span> <span class="c1"># CUDA GPU 0</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">x</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">train_loader</span><span class="p">):</span> <span class="n">x</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">cuda0</span><span class="p">)</span> </pre></div> </div> <p>When working with multiple GPUs on a system, you can use the <code class="docutils literal notranslate"><span class="pre">CUDA_VISIBLE_DEVICES</span></code> environment flag to manage which GPUs are available to PyTorch. As mentioned above, to manually control which GPU a tensor is created on, the best practice is to use a <a class="reference internal" href="../cuda.html#torch.cuda.device" title="torch.cuda.device"><code class="xref any py py-class docutils literal notranslate"><span class="pre">torch.cuda.device</span></code></a> context manager.</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nb">print</span><span class="p">(</span><span class="s2">"Outside device is 0"</span><span class="p">)</span> <span class="c1"># On device 0 (default in most scenarios)</span> <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="mi">1</span><span class="p">):</span> <span class="nb">print</span><span class="p">(</span><span class="s2">"Inside device is 1"</span><span class="p">)</span> <span class="c1"># On device 1</span> <span class="nb">print</span><span class="p">(</span><span class="s2">"Outside device is still 0"</span><span class="p">)</span> <span class="c1"># On device 0</span> </pre></div> </div> <p>If you have a tensor and would like to create a new tensor of the same type on the same device, then you can use a <code class="docutils literal notranslate"><span class="pre">torch.Tensor.new_*</span></code> method (see <a class="reference internal" href="../tensors.html#torch.Tensor" title="torch.Tensor"><code class="xref py py-class docutils literal notranslate"><span class="pre">torch.Tensor</span></code></a>). Whilst the previously mentioned <code class="docutils literal notranslate"><span class="pre">torch.*</span></code> factory functions (<a class="reference internal" href="../torch.html#tensor-creation-ops"><span class="std std-ref">Creation Ops</span></a>) depend on the current GPU context and the attributes arguments you pass in, <code class="docutils literal notranslate"><span class="pre">torch.Tensor.new_*</span></code> methods preserve the device and other attributes of the tensor.</p> <p>This is the recommended practice when creating modules in which new tensors need to be created internally during the forward pass.</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">cuda</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">device</span><span class="p">(</span><span class="s1">'cuda'</span><span class="p">)</span> <span class="n">x_cpu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="n">x_gpu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">cuda</span><span class="p">)</span> <span class="n">x_cpu_long</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">)</span> <span class="n">y_cpu</span> <span class="o">=</span> <span class="n">x_cpu</span><span class="o">.</span><span class="n">new_full</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="n">fill_value</span><span class="o">=</span><span class="mf">0.3</span><span class="p">)</span> <span class="nb">print</span><span class="p">(</span><span class="n">y_cpu</span><span class="p">)</span> <span class="n">tensor</span><span class="p">([[</span> <span class="mf">0.3000</span><span class="p">,</span> <span class="mf">0.3000</span><span class="p">],</span> <span class="p">[</span> <span class="mf">0.3000</span><span class="p">,</span> <span class="mf">0.3000</span><span class="p">],</span> <span class="p">[</span> <span class="mf">0.3000</span><span class="p">,</span> <span class="mf">0.3000</span><span class="p">]])</span> <span class="n">y_gpu</span> <span class="o">=</span> <span class="n">x_gpu</span><span class="o">.</span><span class="n">new_full</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="n">fill_value</span><span class="o">=-</span><span class="mi">5</span><span class="p">)</span> <span class="nb">print</span><span class="p">(</span><span class="n">y_gpu</span><span class="p">)</span> <span class="n">tensor</span><span class="p">([[</span><span class="o">-</span><span class="mf">5.0000</span><span class="p">,</span> <span class="o">-</span><span class="mf">5.0000</span><span class="p">],</span> <span class="p">[</span><span class="o">-</span><span class="mf">5.0000</span><span class="p">,</span> <span class="o">-</span><span class="mf">5.0000</span><span class="p">],</span> <span class="p">[</span><span class="o">-</span><span class="mf">5.0000</span><span class="p">,</span> <span class="o">-</span><span class="mf">5.0000</span><span class="p">]],</span> <span class="n">device</span><span class="o">=</span><span class="s1">'cuda:0'</span><span class="p">)</span> <span class="n">y_cpu_long</span> <span class="o">=</span> <span class="n">x_cpu_long</span><span class="o">.</span><span class="n">new_tensor</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]])</span> <span class="nb">print</span><span class="p">(</span><span class="n">y_cpu_long</span><span class="p">)</span> <span class="n">tensor</span><span class="p">([[</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">]])</span> </pre></div> </div> <p>If you want to create a tensor of the same type and size of another tensor, and fill it with either ones or zeros, <a class="reference internal" href="../generated/torch.ones_like.html#torch.ones_like" title="torch.ones_like"><code class="xref py py-meth docutils literal notranslate"><span class="pre">ones_like()</span></code></a> or <a class="reference internal" href="../generated/torch.zeros_like.html#torch.zeros_like" title="torch.zeros_like"><code class="xref py py-meth docutils literal notranslate"><span class="pre">zeros_like()</span></code></a> are provided as convenient helper functions (which also preserve <code class="xref py py-class docutils literal notranslate"><span class="pre">torch.device</span></code> and <code class="xref py py-class docutils literal notranslate"><span class="pre">torch.dtype</span></code> of a Tensor).</p> <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">x_cpu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span> <span class="n">x_gpu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">)</span> <span class="n">y_cpu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">ones_like</span><span class="p">(</span><span class="n">x_cpu</span><span class="p">)</span> <span class="n">y_gpu</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros_like</span><span class="p">(</span><span class="n">x_gpu</span><span class="p">)</span> </pre></div> </div> </div> <div class="section" id="use-pinned-memory-buffers"> <span id="cuda-memory-pinning"></span><h3>Use pinned memory buffers<a class="headerlink" href="#use-pinned-memory-buffers" title="Permalink to this headline">¶</a></h3> <p>Host to GPU copies are much faster when they originate from pinned (page-locked) memory. CPU tensors and storages expose a <a class="reference internal" href="../tensors.html#torch.Tensor.pin_memory" title="torch.Tensor.pin_memory"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pin_memory()</span></code></a> method, that returns a copy of the object, with data put in a pinned region.</p> <p>Also, once you pin a tensor or storage, you can use asynchronous GPU copies. Just pass an additional <code class="docutils literal notranslate"><span class="pre">non_blocking=True</span></code> argument to a <a class="reference internal" href="../tensors.html#torch.Tensor.to" title="torch.Tensor.to"><code class="xref py py-meth docutils literal notranslate"><span class="pre">to()</span></code></a> or a <a class="reference internal" href="../tensors.html#torch.Tensor.cuda" title="torch.Tensor.cuda"><code class="xref py py-meth docutils literal notranslate"><span class="pre">cuda()</span></code></a> call. This can be used to overlap data transfers with computation.</p> <p>You can make the <a class="reference internal" href="../data.html#torch.utils.data.DataLoader" title="torch.utils.data.DataLoader"><code class="xref py py-class docutils literal notranslate"><span class="pre">DataLoader</span></code></a> return batches placed in pinned memory by passing <code class="docutils literal notranslate"><span class="pre">pin_memory=True</span></code> to its constructor.</p> </div> <div class="section" id="use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel"> <span id="cuda-nn-ddp-instead"></span><h3>Use nn.parallel.DistributedDataParallel instead of multiprocessing or nn.DataParallel<a class="headerlink" href="#use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel" title="Permalink to this headline">¶</a></h3> <p>Most use cases involving batched inputs and multiple GPUs should default to using <a class="reference internal" href="../generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DistributedDataParallel</span></code></a> to utilize more than one GPU.</p> <p>There are significant caveats to using CUDA models with <a class="reference internal" href="../multiprocessing.html#module-torch.multiprocessing" title="torch.multiprocessing"><code class="xref py py-mod docutils literal notranslate"><span class="pre">multiprocessing</span></code></a>; unless care is taken to meet the data handling requirements exactly, it is likely that your program will have incorrect or undefined behavior.</p> <p>It is recommended to use <a class="reference internal" href="../generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DistributedDataParallel</span></code></a>, instead of <a class="reference internal" href="../generated/torch.nn.DataParallel.html#torch.nn.DataParallel" title="torch.nn.DataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DataParallel</span></code></a> to do multi-GPU training, even if there is only a single node.</p> <p>The difference between <a class="reference internal" href="../generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DistributedDataParallel</span></code></a> and <a class="reference internal" href="../generated/torch.nn.DataParallel.html#torch.nn.DataParallel" title="torch.nn.DataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DataParallel</span></code></a> is: <a class="reference internal" href="../generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DistributedDataParallel</span></code></a> uses multiprocessing where a process is created for each GPU, while <a class="reference internal" href="../generated/torch.nn.DataParallel.html#torch.nn.DataParallel" title="torch.nn.DataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DataParallel</span></code></a> uses multithreading. By using multiprocessing, each GPU has its dedicated process, this avoids the performance overhead caused by GIL of Python interpreter.</p> <p>If you use <a class="reference internal" href="../generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel" title="torch.nn.parallel.DistributedDataParallel"><code class="xref py py-class docutils literal notranslate"><span class="pre">DistributedDataParallel</span></code></a>, you could use <cite>torch.distributed.launch</cite> utility to launch your program, see <a class="reference internal" href="../distributed.html#distributed-launch"><span class="std std-ref">Third-party backends</span></a>.</p> </div> </div> </div> </article> </div> <footer> <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation"> <a href="ddp.html" class="btn btn-neutral float-right" title="Distributed Data Parallel" accesskey="n" rel="next">Next <img src="../_static/images/chevron-right-orange.svg" class="next-page"></a> <a href="cpu_threading_torchscript_inference.html" class="btn btn-neutral" title="CPU threading and TorchScript inference" accesskey="p" rel="prev"><img src="../_static/images/chevron-right-orange.svg" class="previous-page"> Previous</a> </div> <hr> <div role="contentinfo"> <p> © Copyright 2019, Torch Contributors. </p> </div> <div> Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/rtfd/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. </div> </footer> </div> </div> <div class="pytorch-content-right" id="pytorch-content-right"> <div class="pytorch-right-menu" id="pytorch-right-menu"> <div class="pytorch-side-scroll" id="pytorch-side-scroll-right"> <ul> <li><a class="reference internal" href="#">CUDA semantics</a><ul> <li><a class="reference internal" href="#tensorfloat-32-tf32-on-ampere-devices">TensorFloat-32(TF32) on Ampere devices</a></li> <li><a class="reference internal" href="#asynchronous-execution">Asynchronous execution</a><ul> <li><a class="reference internal" href="#cuda-streams">CUDA streams</a></li> </ul> </li> <li><a class="reference internal" href="#memory-management">Memory management</a></li> <li><a class="reference internal" href="#cufft-plan-cache">cuFFT plan cache</a></li> <li><a class="reference internal" href="#best-practices">Best practices</a><ul> <li><a class="reference internal" href="#device-agnostic-code">Device-agnostic code</a></li> <li><a class="reference internal" href="#use-pinned-memory-buffers">Use pinned memory buffers</a></li> <li><a class="reference internal" href="#use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel">Use nn.parallel.DistributedDataParallel instead of multiprocessing or nn.DataParallel</a></li> </ul> </li> </ul> </li> </ul> </div> </div> </div> </section> </div> <script type="text/javascript" id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script> <script src="../_static/jquery.js"></script> <script src="../_static/underscore.js"></script> <script src="../_static/doctools.js"></script> <script src="../_static/language_data.js"></script> <script type="text/javascript" src="../_static/js/vendor/popper.min.js"></script> <script type="text/javascript" src="../_static/js/vendor/bootstrap.min.js"></script> <script src="https://cdnjs.cloudflare.com/ajax/libs/list.js/1.5.0/list.min.js"></script> <script type="text/javascript" src="../_static/js/theme.js"></script> <script type="text/javascript"> jQuery(function () { SphinxRtdTheme.Navigation.enable(true); }); </script> <script> (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) })(window,document,'script','https://www.google-analytics.com/analytics.js','ga'); ga('create', 'UA-90545585-1', 'auto'); ga('send', 'pageview'); </script> <script async src="https://www.googletagmanager.com/gtag/js?id=UA-117752657-2"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-117752657-2'); </script> <img height="1" width="1" style="border-style:none;" alt="" src="https://www.googleadservices.com/pagead/conversion/795629140/?label=txkmCPmdtosBENSssfsC&guid=ON&script=0"/> <!-- Begin Footer --> <div class="container-fluid docs-tutorials-resources" id="docs-tutorials-resources"> <div class="container"> <div class="row"> <div class="col-md-4 text-center"> <h2>Docs</h2> <p>Access comprehensive developer documentation for PyTorch</p> <a class="with-right-arrow" href="https://pytorch.org/docs/stable/index.html">View Docs</a> </div> <div class="col-md-4 text-center"> <h2>Tutorials</h2> <p>Get in-depth tutorials for beginners and advanced developers</p> <a class="with-right-arrow" href="https://pytorch.org/tutorials">View Tutorials</a> </div> <div class="col-md-4 text-center"> <h2>Resources</h2> <p>Find development resources and get your questions answered</p> <a class="with-right-arrow" href="https://pytorch.org/resources">View Resources</a> </div> </div> </div> </div> <footer class="site-footer"> <div class="container footer-container"> <div class="footer-logo-wrapper"> <a href="https://pytorch.org/" class="footer-logo"></a> </div> <div class="footer-links-wrapper"> <div class="footer-links-col"> <ul> <li class="list-title"><a href="https://pytorch.org/">PyTorch</a></li> <li><a href="https://pytorch.org/get-started">Get Started</a></li> <li><a href="https://pytorch.org/features">Features</a></li> <li><a href="https://pytorch.org/ecosystem">Ecosystem</a></li> <li><a href="https://pytorch.org/blog/">Blog</a></li> <li><a href="https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md">Contributing</a></li> </ul> </div> <div class="footer-links-col"> <ul> <li class="list-title"><a href="https://pytorch.org/resources">Resources</a></li> <li><a href="https://pytorch.org/tutorials">Tutorials</a></li> <li><a href="https://pytorch.org/docs/stable/index.html">Docs</a></li> <li><a href="https://discuss.pytorch.org" target="_blank">Discuss</a></li> <li><a href="https://github.com/pytorch/pytorch/issues" target="_blank">Github Issues</a></li> <li><a href="https://pytorch.org/assets/brand-guidelines/PyTorch-Brand-Guidelines.pdf" target="_blank">Brand Guidelines</a></li> </ul> </div> <div class="footer-links-col follow-us-col"> <ul> <li class="list-title">Stay Connected</li> <li> <div id="mc_embed_signup"> <form action="https://twitter.us14.list-manage.com/subscribe/post?u=75419c71fe0a935e53dfa4a3f&id=91d0dccd39" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="email-subscribe-form validate" target="_blank" novalidate> <div id="mc_embed_signup_scroll" class="email-subscribe-form-fields-wrapper"> <div class="mc-field-group"> <label for="mce-EMAIL" style="display:none;">Email Address</label> <input type="email" value="" name="EMAIL" class="required email" id="mce-EMAIL" placeholder="Email Address"> </div> <div id="mce-responses" class="clear"> <div class="response" id="mce-error-response" style="display:none"></div> <div class="response" id="mce-success-response" style="display:none"></div> </div> <!-- real people should not fill this in and expect good things - do not remove this or risk form bot signups--> <div style="position: absolute; left: -5000px;" aria-hidden="true"><input type="text" name="b_75419c71fe0a935e53dfa4a3f_91d0dccd39" tabindex="-1" value=""></div> <div class="clear"> <input type="submit" value="" name="subscribe" id="mc-embedded-subscribe" class="button email-subscribe-button"> </div> </div> </form> </div> </li> </ul> <div class="footer-social-icons"> <a href="https://www.facebook.com/pytorch" target="_blank" class="facebook"></a> <a href="https://twitter.com/pytorch" target="_blank" class="twitter"></a> <a href="https://www.youtube.com/pytorch" target="_blank" class="youtube"></a> </div> </div> </div> </div> </footer> <div class="cookie-banner-wrapper"> <div class="container"> <p class="gdpr-notice">To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: <a href="https://www.facebook.com/policies/cookies/">Cookies Policy</a>.</p> <img class="close-button" src="../_static/images/pytorch-x.svg"> </div> </div> <!-- End Footer --> <!-- Begin Mobile Menu --> <div class="mobile-main-menu"> <div class="container-fluid"> <div class="container"> <div class="mobile-main-menu-header-container"> <a class="header-logo" href="https://pytorch.org/" aria-label="PyTorch"></a> <a class="main-menu-close-button" href="#" data-behavior="close-mobile-menu"></a> </div> </div> </div> <div class="mobile-main-menu-links-container"> <div class="main-menu"> <ul> <li> <a href="https://pytorch.org/get-started">Get Started</a> </li> <li> <a href="https://pytorch.org/features">Features</a> </li> <li> <a href="https://pytorch.org/ecosystem">Ecosystem</a> </li> <li> <a href="https://pytorch.org/mobile">Mobile</a> </li> <li> <a href="https://pytorch.org/hub">PyTorch Hub</a> </li> <li> <a href="https://pytorch.org/blog/">Blog</a> </li> <li> <a href="https://pytorch.org/tutorials">Tutorials</a> </li> <li class="active"> <a href="https://pytorch.org/docs/stable/index.html">Docs</a> </li> <li> <a href="https://pytorch.org/resources">Resources</a> </li> <li> <a href="https://github.com/pytorch/pytorch">Github</a> </li> </ul> </div> </div> </div> <!-- End Mobile Menu --> <script type="text/javascript" src="../_static/js/vendor/anchor.min.js"></script> <script type="text/javascript"> $(document).ready(function() { mobileMenu.bind(); mobileTOC.bind(); pytorchAnchors.bind(); sideMenus.bind(); scrollToAnchor.bind(); highlightNavigation.bind(); mainMenuDropdown.bind(); filterTags.bind(); // Add class to links that have code blocks, since we cannot create links in code blocks $("article.pytorch-article a span.pre").each(function(e) { $(this).closest("a").addClass("has-code"); }); }) </script> </body> </html>