.gitattributes CHANGED
@@ -18,7 +18,6 @@
18
  *.ot filter=lfs diff=lfs merge=lfs -text
19
  *.parquet filter=lfs diff=lfs merge=lfs -text
20
  *.pb filter=lfs diff=lfs merge=lfs -text
21
- *.pdf filter=lfs diff=lfs merge=lfs -text
22
  *.pickle filter=lfs diff=lfs merge=lfs -text
23
  *.pkl filter=lfs diff=lfs merge=lfs -text
24
  *.png filter=lfs diff=lfs merge=lfs -text
 
18
  *.ot filter=lfs diff=lfs merge=lfs -text
19
  *.parquet filter=lfs diff=lfs merge=lfs -text
20
  *.pb filter=lfs diff=lfs merge=lfs -text
 
21
  *.pickle filter=lfs diff=lfs merge=lfs -text
22
  *.pkl filter=lfs diff=lfs merge=lfs -text
23
  *.png filter=lfs diff=lfs merge=lfs -text
The_Ultra-Scale_Playbook_Training_LLMs_on_GPU_Clusters.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:274a19a2577ed220cd3a102b4469c44310e4a7c8e8f8ebc36842d907cb51e127
3
- size 14059172
 
 
 
 
assets/images/256px-PDF.png DELETED

Git LFS Details

  • SHA256: 48b7ab9362d78d22ca0d66b2943406759e85cffb86b585176990035d12ac2c7d
  • Pointer size: 129 Bytes
  • Size of remote file: 5.46 kB
dist/assets/.DS_Store DELETED
Binary file (6.15 kB)
 
dist/assets/images/256px-PDF.png DELETED

Git LFS Details

  • SHA256: 38ba9d71a429465ee0b469b43dd0969790e9cfe72c02857f31a75412b3c9e81e
  • Pointer size: 129 Bytes
  • Size of remote file: 1.25 kB
dist/bibliography.bib CHANGED
@@ -488,7 +488,7 @@ url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
- url = {https://github.com/pytorch/ao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
 
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
+ url = {https://github.com/pytorch/torchao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
dist/distill.bundle.js CHANGED
@@ -2146,7 +2146,7 @@ function _arrayWithHoles(r) { if (Array.isArray(r)) return r; }
2146
  function bylineTemplate(frontMatter) {
2147
  return "\n <div class=\"byline grid\">\n <div>\n <h3>Authors</h3>\n <div>\n ".concat(frontMatter.authors.map(function (author, i) {
2148
  return "\n <span class=\"author\">\n ".concat(author.personalURL ? "\n <a class=\"name\" href=\"".concat(author.personalURL, "\">").concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</a>" : "\n <span class=\"name\">".concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</span>", "\n </span>\n ");
2149
- }).join(''), "\n </div>\n </div>\n <div >\n <h3>Affiliation</h3>\n <div><a href=\"https://huggingface.co/\">Hugging Face</a>\n </div>\n </div>\n <div >\n <h3>Published</h3>\n <div>Feb 19, 2025</div>\n </div>\n </div>\n <div class=\"side pdf-download\">\n <a href=\"https://huggingface.co/spaces/nanotron/ultrascale-playbook/resolve/main/The_Ultra-Scale_Playbook_Training_LLMs_on_GPU_Clusters.pdf\">Download PDF\n <br>\n <img style=\"width: 32px;\" src=\"../assets/images/256px-PDF.png\" alt=\"PDF\"></a>\n \n </div>\n");
2150
  }
2151
  var Byline = /*#__PURE__*/function (_HTMLElement4) {
2152
  function Byline() {
 
2146
  function bylineTemplate(frontMatter) {
2147
  return "\n <div class=\"byline grid\">\n <div>\n <h3>Authors</h3>\n <div>\n ".concat(frontMatter.authors.map(function (author, i) {
2148
  return "\n <span class=\"author\">\n ".concat(author.personalURL ? "\n <a class=\"name\" href=\"".concat(author.personalURL, "\">").concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</a>" : "\n <span class=\"name\">".concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</span>", "\n </span>\n ");
2149
+ }).join(''), "\n </div>\n </div>\n <div >\n <h3>Affiliation</h3>\n <div><a href=\"https://huggingface.co/\">Hugging Face</a>\n </div>\n </div>\n <div >\n <h3>Published</h3>\n <div>Feb 19, 2025</div>\n </div>\n </div>\n");
2150
  }
2151
  var Byline = /*#__PURE__*/function (_HTMLElement4) {
2152
  function Byline() {
dist/distill.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
dist/index.html CHANGED
@@ -73,25 +73,25 @@
73
  </d-contents>
74
 
75
  <p>
76
- Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
- <aside>Reading time: 2-4 days. <br>For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
- This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
82
 
83
- <p>As the size of the clusters used to train these models grew, various techniques such as data parallelism, tensor parallelism, pipeline parallelism or context parallelism as well as ZeRO or kernel fusion have been invented to make sure that GPUs are highly utilized at all times. This significantly reduces training time and makes the best use of this expensive hardware. Moreover, the challenge of scaling up AI training goes beyond just building the initial models: teams have found that fine-tuning large models on specialized data often produces the best results, generally involving the same distributed training techniques. In this book we'll progressively go over all of these techniques –from the simplest to the most refined ones– while keeping a single story-line to understand where each method comes from.</p>
84
 
85
  <aside>If you have questions or remarks open a discussion on the <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook/discussions?status=open&type=discussion">Community tab</a>!</aside>
86
 
87
- <p>We'll assume you have some basic knowledge about current LLM architectures and are roughly familiar with how deep learning models are trained, but you can be generally new to distributed training. If needed, the basics of model training can be found in great courses at <a href="https://www.deeplearning.ai">DeepLearning.ai</a> or in the <a href="https://pytorch.org/tutorials/beginner/basics/intro.html">PyTorch tutorial sections</a>. This book can be seen as the second part of a trilogy following our first blog on processing data for pre-training, the so-called “<a href="https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1">FineWeb blog post</a>”. Having read both blog posts, you should have almost all the core knowledge needed to fully understand how performant LLMs are being built nowadays, just missing some final spices regarding data mixing and architecture choices to complete the recipe (stay tuned for part three…).</p>
88
 
89
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating
90
  the template on which we based this blog post.</aside>
91
 
92
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
93
 
94
- <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn about which parts of a language model eat away your memory and when during training it happens. You’ll learn how we can solve memory constraints by parallelizing the models and increase the throughput by scaling up GPUs. As a result you'll understand how the following widget to compute the memory breakdown of a transformer model works: </p>
95
  <aside>Note that we're still missing Pipeline Parallelism in this widget. To be added as an exercise for the reader.</aside>
96
 
97
  <div class="large-image-background-transparent">
@@ -268,7 +268,7 @@
268
  <ol>
269
  <li><strong>Memory Usage</strong>: it's a hard limitation - if a training step doesn't fit in memory, training cannot proceed</li>
270
  <li><strong>Compute Efficiency</strong>: we want our hardware to spend most time computing, so we need to reduce time spent on data transfers or waiting for other GPUs to perform work.</li>
271
- <li><strong>Communication overhead</strong>: we want to minimize communication overhead as it keeps GPUs idle. To achieve this we will try to make best use of intra-node (fast) and inter-node (slower) bandwidths as well as overlap communication with compute as much as possible.</li>
272
  </ol>
273
  <p>In many places we'll see that we can trade one of these (computation, communication, memory) for another (e.g. recomputation or Tensor Parallelism). Finding the right balance is key to scaling training.</p>
274
  <p>
@@ -341,7 +341,7 @@
341
 
342
  <aside>For instance, during DeepSeek-V3/R1 training “the batch size is gradually increased from 3072 input sequences to 15360 in the training of the first 469B tokens, and then keeps at 15360 input samples in the remaining training”.</aside>
343
 
344
- <p>Batch size also affects the time it takes to train on a given text dataset: a small batch size will require more optimizer steps to train on the same amount of samples. Optimizer steps are costly (in compute time) and the total time to train will thus increase compared to using a larger batch size. This being said, note that the batch size can often be adjusted quite largely around the optimal batch size without major impact on the performance of the model, i.e. the sensitivity of final model performances to the exact batch size value is usually rather low around the optimal batch size.</p>
345
 
346
  <p>In the LLM pretraining community, batch sizes are commonly reported in terms of tokens rather than in number of samples (<d-math>bst</d-math> = Batch Size Tokens), this makes training numbers generally independent of the exact input sequence length used during the training.</p>
347
 
@@ -353,7 +353,7 @@
353
 
354
  <p>From here onward we’ll show the formulas for the batch size in terms of samples but you can always get its token-unit counterpart by multiplying it with the sequence length.</p>
355
 
356
- <p>A sweet spot for recent LLM training is typically on the order of 4-60 million tokens per batch. The batch size as well as the training corpus have been steadily increasing over the years: Llama 1 was trained with a batch size of ~4M tokens for 1.4 trillion tokens while DeepSeek was trained with a batch size of ~60M tokens for 14 trillion tokens.</p>
357
 
358
  <p><strong>And our first challenge is already coming ahead when scaling the training of our model to these large batch sizes: out-of-memory issues. What should we do when our GPU doesn’t have enough memory to hold a full batch of our target batch size?</strong></p>
359
 
@@ -361,7 +361,7 @@
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
- <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
@@ -374,7 +374,7 @@
374
  <p class="note-box-title">πŸ“ Note</p>
375
  <div class="note-box-content">
376
  <p>
377
- You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
@@ -385,11 +385,11 @@
385
 
386
  <p>These items are stored as tensors which come in different <em>shapes</em> and <em>precisions</em>. The <em>shapes</em> are determined by hyper-parameters such as batch size, sequence length, model hidden dimensions, attention heads, vocabulary size, and potential model sharding as we’ll see later. <em>Precision</em> refers to formats like FP32, BF16, or FP8, which respectively require 4, 2, or 1 byte to store each single value in the tensor. We will have a full discussion of the different precisions and their trade-offs in the <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> section, for now let's just keep in mind that the memory requirements for these various format will be different and that will impact the memory usage of the items we need to store.</p>
387
 
388
- <p>So how can I quickly determine memory usage from these variables? One simple way is to do this empirically and just measure it.</p>
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
- <p>Using the Pytorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
395
 
@@ -434,7 +434,7 @@
434
  \end{aligned}
435
  </d-math>
436
 
437
- <p>Now let’s have a look at how things change if we use a lower precision. For stability reasons (see <a target="_self" href="#mixed_precision_training">the mixed-precision training section below</a>) we often don't use full low precision training but a mix of higher and lower precision called "mixed precision"<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays for mixed precision training is to generally use BF16 for most of the computations –requiring 2 bytes per parameter and gradient– as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
438
 
439
  <aside>See some more details below when we cover the ZeRO methods.</aside>
440
 
@@ -504,7 +504,7 @@
504
 
505
  <p>As we can see, as soon as we reach <strong>7B</strong> (!), weights and optimizer requirements already start to add up significantly and exceed the size of a typical GPU's memory, e.g. 80GB for an H100 GPU.</p>
506
 
507
- <p>But for now, let’s start with models which still fit in a single GPU and take a look at the last big contributor to our memory budget: the activation memory.</p>
508
 
509
  <h4>Activations memory</h4>
510
 
@@ -518,7 +518,7 @@
518
 
519
  <p>For the exact derivation of the numbers, you can follow this original NVIDIA paper on recomputation <d-cite bibtex-key="korthikanti2022recomputation"></d-cite>, it essentially requires you to do some accounting of all the sizes of intermediate activations between each operation in a transformer layer.</p>
520
 
521
- <p>An interesting observation here is that memory usage is not static for a given model; rather, it scales linearly with the batch size and quadratically with the sequence length. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
522
 
523
  <div class="l-body-outset" id="fragment-memusage_activations"></div>
524
  <!-- <script>
@@ -539,7 +539,7 @@
539
 
540
  <h3>Activation recomputation</h3>
541
 
542
- <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. feed-forward, layernorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
543
 
544
  <div class="svg-container" id="svg-activation_recomputation"> </div>
545
  <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
@@ -548,7 +548,7 @@
548
 
549
  <ul>
550
  <li><strong>Full</strong>: We checkpoint activations at the transition point between each layer of the Transformer model. This is usually called the <code>full</code> strategy since it requires a forward pass through each layer essentially adding a full forward pass during the backward pass. This strategy saves the most memory but is the most expensive one in terms of compute. It generally increases the compute cost and time by up to 30-40% which is very noticeable.</li>
551
- <li><strong>Selective</strong>: In general we can do better than full. The authors of the recomputation paper<d-cite bibtex-key="korthikanti2022recomputation"></d-cite> did a detailed analysis studying which activations grow the largest and have the cheapest recomputation cost in terms of FLOPs. Turns out that the attention computations fall in that category, and thus we can usually discard them and focus on checkpointing the expensive feedforward computations. For a GPT-3 (175B) model this means <strong>70% activation memory reduction at a 2.7% compute cost</strong>.</li>
552
  </ul>
553
 
554
  <aside>In recent models like DeepSeek V3, selective checkpointing is performed, storing an even smaller set of attention activations –using so-called “Multi-Head Latent Attention” (MLA)– to optimize activation memory usage.</aside>
@@ -611,7 +611,7 @@
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
- <p>Before that, let's quickly see how we can visualize computation and communication with a short tour of one of the most useful tools in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
@@ -802,7 +802,7 @@
802
 
803
  <p>While data parallelism nicely overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. Why? Because as we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly and the network requirements become too large for the benefits. As a result, our setup will become less and less efficient with each additional GPU we add to the system.</p>
804
 
805
- <p>Let's see this happening in practice with some benchmark:</p>
806
 
807
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
808
  <div class="l-body-outset" id="fragment-dp_scaling"></div>
@@ -864,7 +864,7 @@
864
 
865
  <h4>Memory usage revisited</h4>
866
 
867
- <p>You likely remember from <a target="_self" href="#memory_usage_in_transformers"> our previous section</a> the memory usage of optimizer states, gradients, and parameters during a standard training. Let's call our model's parameters count <d-math>\Psi</d-math> (previously N but here we use the original ZeRO paper notation). In <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> (more details in a later section) with the Adam optimizer, the memory usage for each item we need to store is:</p>
868
 
869
  <ul>
870
  <li>Model’s parameters (half precision i.e. bf16/fp16): <d-math>2\Psi</d-math></li>
@@ -902,7 +902,7 @@
902
  </ul>
903
  <aside>Note: reduce-scatter is 2 times faster than all reduce! <em>Yay, a third communication primitive!</em></aside>
904
 
905
- <p>You may be wondering what is this "reduce-scatter" operation and how this all looks, so let's try to make this more graphical with the figure below. We'll go over all the steps of a forward/backward pass cycle:</p>
906
 
907
  <p><img alt="dp_zero1.gif" src="/assets/images/dp_zero1.gif" /></p>
908
 
@@ -1320,6 +1320,7 @@
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
 
1323
  </ul>
1324
 
1325
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
@@ -1353,9 +1354,9 @@
1353
 
1354
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1355
 
1356
- <p>The core idea of Context Parallelism is to apply a similar idea to the Sequence Parallelism approach (aka to split along the sequence length) but to the modules where we already apply Tensor Parallelism. We will thus split these modules along two dimensions, thereby also reducing the effect of sequence length. You will find this approach quite intuitive after all we’ve already covered but... there is a trick to it so stay awake!</p>
1357
 
1358
- <p>For Context Parallelism, just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
1359
 
1360
  <!-- <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
1361
  -->
@@ -1363,7 +1364,7 @@
1363
 
1364
  <p>There is one important exception though as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
1365
 
1366
- <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1367
 
1368
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast! Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1369
 
@@ -1503,7 +1504,7 @@
1503
  </div>
1504
  <p>The remaining idle time is indicated in grey and usually called the “bubble” and the sight of this probably breaks your heart after we spent so much time optimizing throughput.</p>
1505
 
1506
- <p>We can quantify how efficient a pipeline setup is by looking at how much time we lose because of the bubble. Let’s say <d-math>t_f</d-math> and <d-math>t_b</d-math> are the times for the forward and backward pass, respectively, as measured for one microbatch and one stage of the pipeline (a simple assumption is often to have <d-math>t_b \approx 2 \times t_f</d-math> which you can see on the above graph). If we could perfectly parallelize, the ideal total time would be <d-math>t_{id}=t_f + t_b</d-math>. However, we can see on the graph that due to the pipeline bubble there is additional time of <d-math>t_{pb}=(p-1)*(t_f+t_b)</d-math> (where <d-math>p</d-math> is the degree of pipeline parallelism, i.e. the number of GPUs on the above graph), i.e. the time each GPU is waiting while other GPUs are computing.</p>
1507
 
1508
  <p>We can compute the ratio of the additional bubble time over the ideal time:
1509
  </p>
@@ -1591,7 +1592,7 @@
1591
 
1592
  <p><img alt="pp_1f1b_interleaved.svg" src="/assets/images/pp_1f1b_interleaved.svg" /></p>
1593
 
1594
- <div class="figure-legend"><p>An example of interleaved pipeline parallelism for a model with layers distributed across 4 GPUs. Numbers still correspond to the microbatches IDs but for clarity we've colored differently the first and the last layers of the model to illustrate how layers are spread across GPUs.</p>
1595
  </div>
1596
 
1597
  <p>As a consequence we see additional communications happening as the model goes several times through each GPU for the same computation that previously just took one pass. However, each forward and backward pass is divided by a factor of <d-math>v</d-math>, where <d-math>v</d-math> is the number of stages or model chunks per GPUs as we are able to better interleave forward and backward passes. </p>
@@ -1745,7 +1746,7 @@
1745
  </table>
1746
  </div>
1747
 
1748
- <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches, and the choice between both will depend on whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible unnecessary communication overhead.</p>
1749
 
1750
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenges. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1751
 
@@ -1793,7 +1794,7 @@
1793
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1794
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1795
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1796
- <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component with the exception that modules and layers need to be balanced in Pipeline Parallelism, the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1797
  </ul>
1798
 
1799
  <table>
@@ -1912,7 +1913,7 @@
1912
 
1913
  <h2>Finding the Best Training Configuration</h2>
1914
 
1915
- <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remains a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1916
 
1917
  <p>We touched this a little bit in the previous section but let's now walk in details through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1918
 
@@ -2015,7 +2016,7 @@
2015
 
2016
  <h3>Lessons learned on benchmarking</h3>
2017
 
2018
- <p>Our goal for this book was not only to discuss theory and implementations but provide actual data points as well. So the plan was simple: let's run every possible distributed configuration for every model and a number of cluster sizes (namely 1-64 nodes of 8xH100s). Even after excluding impossible configurations we still needed to run thousands of experiments. </p>
2019
 
2020
  <p>
2021
  On paper this sounds easy enough: we can easily launch big arrays of jobs on our cluster. However, as soon as we launched the first batches of experiments, troubles began:
@@ -2272,7 +2273,7 @@
2272
 
2273
  </ol>
2274
 
2275
- <p>Let’s talk about one of the most frequent techniques we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance by a lot.</p>
2276
 
2277
  <h4>Memory Coalescing</h4>
2278
 
@@ -2410,7 +2411,7 @@
2410
 
2411
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2412
 
2413
- <p>Let's briefly go through a last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2414
 
2415
  <h4>Minimizing Control Divergence</h4>
2416
 
@@ -2565,13 +2566,13 @@
2565
 
2566
  <p>We can see that float32 spans 80 orders of magnitude and float16 sacrifices a lot of range while bfloat16 maintains the full range. The two float8 formats reduce the range even further where e5m2 can maintain the float16 range and e4m3 has an even smaller range.</p>
2567
 
2568
- <p>How come some formats are able to maintain the range and others not? Let’s investigate the resolution by plotting 10,000 points between 1 and 2. Each point will be rounded to the nearest representable number in each format:</p>
2569
 
2570
  <p><img alt="image.png" src="/assets/images/mixedprecision_2.png" /></p>
2571
 
2572
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this at the cost of sacrificing more precision. In the case of float8 the situation is even more dire, as e4m3 can represent only 7 and e5m2 only 3 numbers on the interval 1-2.</p>
2573
 
2574
- <p>A common metric to measure a format's resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19 \times 10^{-7}</d-math>). For float16 it is ~ <d-math>10^{-3}</d-math> and for bfloat16 ~10x higher still.</p>
2575
 
2576
  <p>The idea of mixed precision training is to use some of these lower precisions formats while maintaining the performance of full precision training. </p>
2577
 
@@ -3046,7 +3047,7 @@
3046
  print(f"After broadcast on rank {dist.get_rank()}: {tensor}")
3047
 
3048
  init_process()
3049
- example_broadcast()
3050
  </d-code>
3051
 
3052
 
@@ -3379,7 +3380,7 @@
3379
 
3380
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3381
 
3382
- <p>Now that we've covered the fundamental operations for distributed training, you should be ready to follow the blog post easily.</p>
3383
 
3384
  <h3>A1: Distributed Training Profiling</h3>
3385
 
@@ -3828,8 +3829,7 @@
3828
  }
3829
  if (level === 0)
3830
  ToC += '<div>' + link + '</div>';
3831
- else
3832
- // else if (level === 1)
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
 
73
  </d-contents>
74
 
75
  <p>
76
+ Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
+ <aside>Reading time: 2-4 days. For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
+ This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
82
 
83
+ <p>As the size of the clusters used to train these models grew, various techniques such as data parallelism, tensor parallelism, pipeline parallelism or context parallelism as well as ZeRO or kernel fusion have been invented to make sure that GPUs are highly utilized at all times. This significantly reduces training time and makes the best use of this expensive hardware. Moreover, the challenge of scaling up AI training goes beyond just building the initial models: teams have found that fine-tuning large models on specialized data often produces the best results, generally involving the same distributed training techniques. In this book we'll progressively go over all of these techniques –from the simplest to the most refined ones– while keeping a single story-line to understand where each method comes from.</p>
84
 
85
  <aside>If you have questions or remarks open a discussion on the <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook/discussions?status=open&type=discussion">Community tab</a>!</aside>
86
 
87
+ <p>We'll assume you have some basic knowledge about current LLM architectures and are roughly familiar with how deep learning models are trained, but you can be generally new to distributed training. If needed, the basics of model training can be found in great courses at <a href="https://www.deeplearning.ai">DeepLearning.ai</a> or in the <a href="https://pytorch.org/tutorials/beginner/basics/intro.html">PyTorch tutorial sections</a>. This book can be seen as the second part of a trilogy following our first blog on processing data for pre-training, the so-called “<a href="https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1">FineWeb blog post</a>”. Having read both blog posts, you should have almost all the core knowledge needed to fully understand how performant LLMs are being built nowadays, just missing some final spices regarding data mixing and architecture choices to complete the recipe (stay tuned for part three…).</p>
88
 
89
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating
90
  the template on which we based this blog post.</aside>
91
 
92
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
93
 
94
+ <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn about which parts of a language model eat away your memory and when during training it happens. You’ll learn how we can solve memory constraints by parallelizing the models and increase the throughput by scaling up GPUs. As a result you'll understand how the following widget to compute the memory breakdown of a transformer model works: </p>
95
  <aside>Note that we're still missing Pipeline Parallelism in this widget. To be added as an exercise for the reader.</aside>
96
 
97
  <div class="large-image-background-transparent">
 
268
  <ol>
269
  <li><strong>Memory Usage</strong>: it's a hard limitation - if a training step doesn't fit in memory, training cannot proceed</li>
270
  <li><strong>Compute Efficiency</strong>: we want our hardware to spend most time computing, so we need to reduce time spent on data transfers or waiting for other GPUs to perform work.</li>
271
+ <li><strong>Communication overhead</strong>: we want to minimize communication overhead as it keeps GPUs idle. To achieve this we will try to make best use of intra-node (fast) and inter-node (slower) bandwidths as well as overlap communication with compute as much as possible.</li>
272
  </ol>
273
  <p>In many places we'll see that we can trade one of these (computation, communication, memory) for another (e.g. recomputation or Tensor Parallelism). Finding the right balance is key to scaling training.</p>
274
  <p>
 
341
 
342
  <aside>For instance, during DeepSeek-V3/R1 training “the batch size is gradually increased from 3072 input sequences to 15360 in the training of the first 469B tokens, and then keeps at 15360 input samples in the remaining training”.</aside>
343
 
344
+ <p>Batch size also affects the time it takes to train on a given text dataset: a small batch size will require more optimizer steps to train on the same amount of samples. Optimizer steps are costly (in compute time) and the total time to train will thus increase compared to using a larger batch size. This being said, note that the batch size can often be adjusted quite largely around the optimal batch size without major impact on the performance of the model, i.e. the sensitivity of final model performances to the exact batch size value is usually rather low around the optimal batch size.</p>
345
 
346
  <p>In the LLM pretraining community, batch sizes are commonly reported in terms of tokens rather than in number of samples (<d-math>bst</d-math> = Batch Size Tokens), this makes training numbers generally independent of the exact input sequence length used during the training.</p>
347
 
 
353
 
354
  <p>From here onward we’ll show the formulas for the batch size in terms of samples but you can always get its token-unit counterpart by multiplying it with the sequence length.</p>
355
 
356
+ <p>A sweet spot for recent LLM training is typically on the order of 4-60 million tokens per batch. The batch size as well as the training corpus have been steadily increasing over the years: Llama 1 was trained with a batch size of ~4M tokens for 1.4 trillion tokens while DeepSeek was trained with a batch size of ~60M tokens for 14 trillion tokens.</p>
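<p>As a quick sanity check of these numbers, here is a small back-of-the-envelope helper (the sequence lengths below are our own illustrative assumptions, roughly 2k tokens for Llama 1 and 4k tokens for DeepSeek-V3 pretraining):</p>
<d-code block language="python">
# Hypothetical helper: convert a batch size in samples to a batch size in tokens
def batch_size_tokens(batch_size_samples: int, sequence_length: int) -> int:
    return batch_size_samples * sequence_length

print(batch_size_tokens(2048, 2048))    # ~4.2M tokens, a Llama 1-style batch
print(batch_size_tokens(15360, 4096))   # ~63M tokens, a DeepSeek-V3-style batch
</d-code>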
357
 
358
  <p><strong>And our first challenge is already coming ahead when scaling the training of our model to these large batch sizes: out-of-memory issues. What should we do when our GPU doesn’t have enough memory to hold a full batch of our target batch size?</strong></p>
359
 
 
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
+ <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
 
374
  <p class="note-box-title">πŸ“ Note</p>
375
  <div class="note-box-content">
376
  <p>
377
+ You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
 
385
 
386
  <p>These items are stored as tensors which come in different <em>shapes</em> and <em>precisions</em>. The <em>shapes</em> are determined by hyper-parameters such as batch size, sequence length, model hidden dimensions, attention heads, vocabulary size, and potential model sharding as we’ll see later. <em>Precision</em> refers to formats like FP32, BF16, or FP8, which respectively require 4, 2, or 1 byte to store each single value in the tensor. We will have a full discussion of the different precisions and their trade-offs in the <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> section, for now let's just keep in mind that the memory requirements for these various format will be different and that will impact the memory usage of the items we need to store.</p>
387
 
388
+ <p>So how can I quickly determine memory usage from these variables? One simple way is to do this empirically and just measure it.</p>
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
+ <p>Using the PyTorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
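<p>If you want to reproduce this kind of memory timeline yourself, a minimal sketch using PyTorch's memory snapshot API could look as follows (the underscore-prefixed functions are not a stable public API, and the tiny linear model and optimizer are placeholders, not the setup used for the figures here):</p>
<d-code block language="python">
import torch

# Record an allocation timeline for a few dummy training steps
torch.cuda.memory._record_memory_history(max_entries=100_000)

model = torch.nn.Linear(4096, 4096).cuda()
optimizer = torch.optim.AdamW(model.parameters())

for _ in range(3):
    x = torch.randn(8, 4096, device="cuda")
    loss = model(x).square().mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

# Dump a snapshot that can be inspected at https://pytorch.org/memory_viz
torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)
</d-code>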
395
 
 
434
  \end{aligned}
435
  </d-math>
436
 
437
+ <p>Now let’s have a look at how things change if we use a lower precision. For stability reasons (see <a target="_self" href="#mixed_precision_training">the mixed-precision training section below</a>) we often don't use full low precision training but a mix of higher and lower precision called "mixed precision"<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays for mixed precision training is to generally use BF16 for most of the computations –requiring 2 bytes per parameter and gradient– as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
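<p>To make this accounting concrete, here is a small back-of-the-envelope helper of our own (not the widget's actual code) that adds up the bytes per parameter described above, ignoring activations:</p>
<d-code block language="python">
def mixed_precision_adam_memory_gb(num_params: float) -> float:
    """Rough estimate of weights + gradients + optimizer states in BF16 mixed precision with Adam."""
    bytes_per_param = (
        2 + 2      # BF16 weights and BF16 gradients
        + 4 + 4    # FP32 copy of weights and gradients
        + 4 + 4    # FP32 Adam momentum and variance
    )
    return num_params * bytes_per_param / 1e9

print(f"{mixed_precision_adam_memory_gb(7e9):.0f} GB")   # ~140 GB for a 7B model
print(f"{mixed_precision_adam_memory_gb(70e9):.0f} GB")  # ~1400 GB for a 70B model
</d-code>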
438
 
439
  <aside>See some more details below when we cover the ZeRO methods.</aside>
440
 
 
504
 
505
  <p>As we can see, as soon as we reach <strong>7B</strong> (!), weights and optimizer requirements already start to add up significantly and exceed the size of a typical GPU's memory, e.g. 80GB for an H100 GPU.</p>
506
 
507
+ <p>But for now, let’s start with models which still fit in a single GPU and take a look at the last big contributor to our memory budget: the activation memory.</p>
508
 
509
  <h4>Activations memory</h4>
510
 
 
518
 
519
  <p>For the exact derivation of the numbers, you can follow this original NVIDIA paper on recomputation <d-cite bibtex-key="korthikanti2022recomputation"></d-cite>, it essentially requires you to do some accounting of all the sizes of intermediate activations between each operation in a transformer layer.</p>
520
 
521
+ <p>An interesting observation here is that memory usage is not static for a given model; rather, it scales linearly with the batch size and quadratically with the sequence length. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
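<p>As a rough sketch of that accounting (using the per-layer estimate from the NVIDIA recomputation paper cited above, assuming BF16 activations and no recomputation or flash-attention savings; the Llama-8B-like shapes are illustrative):</p>
<d-code block language="python">
def activation_memory_gb(seq_len, batch_size, hidden, heads, layers):
    """Approximate activation memory in GB using the per-layer estimate
    seq * bs * hidden * (34 + 5 * heads * seq / hidden) bytes (16-bit activations)."""
    per_layer_bytes = seq_len * batch_size * hidden * (34 + 5 * heads * seq_len / hidden)
    return layers * per_layer_bytes / 1e9

# Illustrative Llama-8B-like shapes: hidden=4096, heads=32, layers=32, bs=1
for seq in (2048, 4096, 8192):
    print(f"seq={seq}: {activation_memory_gb(seq, 1, 4096, 32, 32):.0f} GB")
</d-code>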
522
 
523
  <div class="l-body-outset" id="fragment-memusage_activations"></div>
524
  <!-- <script>
 
539
 
540
  <h3>Activation recomputation</h3>
541
 
542
+ <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. feed-forward, layernorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
543
 
544
  <div class="svg-container" id="svg-activation_recomputation"> </div>
545
  <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
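<p>In PyTorch you can get this behavior per block with <code>torch.utils.checkpoint</code>; here is a minimal sketch of the idea (a toy MLP block, not the actual implementation used for the benchmarks in this book):</p>
<d-code block language="python">
import torch
from torch.utils.checkpoint import checkpoint

class CheckpointedBlock(torch.nn.Module):
    """Wraps a block so its inner activations are recomputed during the backward pass."""
    def __init__(self, block):
        super().__init__()
        self.block = block

    def forward(self, x):
        # use_reentrant=False is the recommended non-reentrant code path
        return checkpoint(self.block, x, use_reentrant=False)

block = torch.nn.Sequential(
    torch.nn.Linear(1024, 4096), torch.nn.GELU(), torch.nn.Linear(4096, 1024)
)
y = CheckpointedBlock(block)(torch.randn(2, 1024, requires_grad=True))
y.sum().backward()  # the block's forward is re-run here instead of keeping its activations
</d-code>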
 
548
 
549
  <ul>
550
  <li><strong>Full</strong>: We checkpoint activations at the transition point between each layer of the Transformer model. This is usually called the <code>full</code> strategy since it requires a forward pass through each layer essentially adding a full forward pass during the backward pass. This strategy saves the most memory but is the most expensive one in terms of compute. It generally increases the compute cost and time by up to 30-40% which is very noticeable.</li>
551
+ <li><strong>Selective</strong>: In general we can do better than full. The authors of the recomputation paper<d-cite bibtex-key="korthikanti2022recomputation"></d-cite> did a detailed analysis studying which activations grow the largest and have the cheapest recomputation cost in terms of FLOPs. Turns out that the attention computations fall in that category, and thus we can usually discard them and focus on checkpointing the expensive feedforward computations. For a GPT-3 (175B) model this means <strong>70% activation memory reduction at a 2.7% compute cost</strong>.</li>
552
  </ul>
553
 
554
  <aside>In recent models like DeepSeek V3, selective checkpointing is performed, storing an even smaller set of attention activations –using so-called “Multi-Head Latent Attention” (MLA)– to optimize activation memory usage.</aside>
 
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
+ <p>Before that, let's quickly see how we can visualize computation and communication with a short tour of one of the most useful tools in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
 
802
 
803
  <p>While data parallelism nicely overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. Why? Because as we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly and the network requirements become too large for the benefits. As a result, our setup will become less and less efficient with each additional GPU we add to the system.</p>
804
 
805
+ <p>Let's see this happening in practice with some benchmarks:</p>
806
 
807
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
808
  <div class="l-body-outset" id="fragment-dp_scaling"></div>
 
864
 
865
  <h4>Memory usage revisited</h4>
866
 
867
+ <p>You likely remember from <a target="_self" href="#memory_usage_in_transformers"> our previous section</a> the memory usage of optimizer states, gradients, and parameters during a standard training. Let's call our model's parameters count <d-math>\Psi</d-math> (previously N but here we use the original ZeRO paper notation). In <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> (more details in a later section) with the Adam optimizer, the memory usage for each item we need to store is:</p>
868
 
869
  <ul>
870
  <li>Model’s parameters (half precision i.e. bf16/fp16): <d-math>2\Psi</d-math></li>
 
902
  </ul>
903
  <aside>Note: reduce-scatter is 2 times faster than all reduce! <em>Yay, a third communication primitive!</em></aside>
904
 
905
+ <p>You may be wondering what is this "reduce-scatter" operation and how this all looks, so let's try to make this more graphical with the figure below. We'll go over all the steps of a forward/backward pass cycle:</p>
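<p>If you have never used it, here is a minimal sketch of the reduce-scatter primitive with <code>torch.distributed</code> (to be launched with e.g. <code>torchrun --nproc_per_node=2</code>; the tensor sizes are arbitrary):</p>
<d-code block language="python">
import torch
import torch.distributed as dist

# Each rank contributes a full gradient-like tensor and receives only its own reduced shard
dist.init_process_group(backend="nccl")
rank, world_size = dist.get_rank(), dist.get_world_size()
torch.cuda.set_device(rank)

full = torch.full((world_size * 4,), float(rank), device="cuda")  # pretend these are gradients
shard = torch.empty(4, device="cuda")
dist.reduce_scatter_tensor(shard, full, op=dist.ReduceOp.SUM)
print(f"rank {rank} owns the reduced shard: {shard}")
</d-code>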
906
 
907
  <p><img alt="dp_zero1.gif" src="/assets/images/dp_zero1.gif" /></p>
908
 
 
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1324
  </ul>
1325
 
1326
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
 
1354
 
1355
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1356
 
1357
+ <p>The core idea of Context Parallelism is to apply a similar idea to the Sequence Parallelism approach (aka to split along the sequence length) but to the modules where we already apply Tensor Parallelism. We will thus split these modules along two dimensions, thereby also reducing the effect of sequence length. You will find this approach quite intuitive after all we’ve already covered but... there is a trick to it so stay awake!</p>
1358
 
1359
+ <p>For Context Parallelism, just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
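<p>Concretely, the input split itself is just a chunk along the sequence axis. A minimal sketch (ignoring the attention subtlety discussed next; <code>cp_rank</code> and <code>cp_size</code> are hypothetical values you would get from your process group):</p>
<d-code block language="python">
import torch

def split_for_context_parallelism(input_ids: torch.Tensor, cp_rank: int, cp_size: int) -> torch.Tensor:
    """Keep only this rank's slice of the sequence dimension of a (batch, seq_len) tensor."""
    return torch.chunk(input_ids, cp_size, dim=1)[cp_rank]

batch = torch.arange(16).reshape(1, 16)                              # one sequence of 16 tokens
print(split_for_context_parallelism(batch, cp_rank=1, cp_size=4))    # tokens 4..7 end up on rank 1
</d-code>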
1360
 
1361
  <!-- <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
1362
  -->
 
1364
 
1365
  <p>There is one important exception though as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
1366
 
1367
+ <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1368
 
1369
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast! Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1370
 
 
1504
  </div>
1505
  <p>The remaining idle time is indicated in grey and usually called the “bubble” and the sight of this probably breaks your heart after we spent so much time optimizing throughput.</p>
1506
 
1507
+ <p>We can quantify how efficient a pipeline setup is by looking at how much time we lose because of the bubble. Let’s say <d-math>t_f</d-math> and <d-math>t_b</d-math> are the times for the forward and backward pass, respectively, as measured for one microbatch and one stage of the pipeline (a simple assumption is often to have <d-math>t_b \approx 2 \times t_f</d-math> which you can see on the above graph). If we could perfectly parallelize, the ideal total time would be <d-math>t_{id}=t_f + t_b</d-math>. However, we can see on the graph that due to the pipeline bubble there is additional time of <d-math>t_{pb}=(p-1)*(t_f+t_b)</d-math> (where <d-math>p</d-math> is the degree of pipeline parallelism, i.e. the number of GPUs on the above graph), i.e. the time each GPU is waiting while other GPUs are computing.</p>
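<p>A quick numeric sketch of that bubble, with made-up timings just to see the orders of magnitude:</p>
<d-code block language="python">
t_f, t_b = 1.0, 2.0            # illustrative forward/backward times per microbatch
p = 4                          # pipeline stages (GPUs)
t_ideal = t_f + t_b
t_bubble = (p - 1) * (t_f + t_b)
print(t_bubble / t_ideal)      # 3.0: with a single microbatch, idle time is 3x the useful time
</d-code>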
1508
 
1509
  <p>We can compute the ratio of the additional bubble time over the ideal time:
1510
  </p>
 
1592
 
1593
  <p><img alt="pp_1f1b_interleaved.svg" src="/assets/images/pp_1f1b_interleaved.svg" /></p>
1594
 
1595
+ <div class="figure-legend"><p>An example of interleaved pipeline parallelism for a model with layers distributed across 4 GPUs. Numbers still correspond to the microbatches IDs but for clarity we've colored differently the first and the last layers of the model to illustrate how layers are spread across GPUs.</p>
1596
  </div>
1597
 
1598
  <p>As a consequence we see additional communications happening as the model goes several times through each GPU for the same computation that previously just took one pass. However, each forward and backward pass is divided by a factor of <d-math>v</d-math>, where <d-math>v</d-math> is the number of stages or model chunks per GPUs as we are able to better interleave forward and backward passes. </p>
 
1746
  </table>
1747
  </div>
1748
 
1749
+ <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches, and the choice between both will depend on whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible unnecessary communication overhead.</p>
1750
 
1751
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenges. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1752
 
 
1794
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1795
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1796
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1797
+ <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component with the exception that modules and layers need to be balanced in Pipeline Parallelism; the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1798
  </ul>
1799
 
1800
  <table>
 
1913
 
1914
  <h2>Finding the Best Training Configuration</h2>
1915
 
1916
+ <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remains a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1917
 
1918
  <p>We touched this a little bit in the previous section but let's now walk in details through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1919
 
 
2016
 
2017
  <h3>Lessons learned on benchmarking</h3>
2018
 
2019
+ <p>Our goal for this book was not only to discuss theory and implementations but also to provide actual data points. So the plan was simple: let's run every possible distributed configuration for every model and a number of cluster sizes (namely 1-64 nodes of 8xH100s). Even after excluding impossible configurations we still needed to run thousands of experiments. </p>
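  <p>To give a concrete (and very simplified) picture of what "every possible configuration" means, here is a hypothetical sketch of how such a sweep can be enumerated. The parallelism ranges and helper name are illustrative assumptions, not the actual benchmarking harness:</p>
  <d-code block language="python">
# Hypothetical sketch: enumerate candidate (dp, tp, pp) configurations for a
# cluster and discard impossible ones before launching benchmark jobs.
from itertools import product

WORLD_SIZES = [8 * n for n in (1, 2, 4, 8, 16, 32, 64)]  # 1-64 nodes of 8 GPUs

def candidate_configs(world_size, num_layers=32, max_tp=16, max_pp=16):
    for tp, pp in product([1, 2, 4, 8, max_tp], [1, 2, 4, 8, max_pp]):
        if world_size % (tp * pp) != 0:
            continue  # the data-parallel degree must be an integer
        if pp > num_layers:
            continue  # cannot have more pipeline stages than layers
        yield {"dp": world_size // (tp * pp), "tp": tp, "pp": pp}

total = sum(len(list(candidate_configs(ws))) for ws in WORLD_SIZES)
print(f"{total} candidate configurations before sweeping micro-batch sizes")
  </d-code>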
2020
 
2021
  <p>
2022
  On paper this sounds easy enough: we can easily launch big arrays of jobs on our cluster. However, as soon as we launched the first batches of experiments, troubles began:
 
2273
 
2274
  </ol>
2275
 
2276
+ <p>Let’s talk about one of the most frequently used techniques in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache, which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance a lot.</p>
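  <p>As a rough illustration of why access patterns matter (without writing a CUDA kernel yet), the following sketch compares copying the same data through a contiguous layout versus a transposed, strided view in PyTorch. The tensor size is an arbitrary assumption and the exact gap depends on your hardware:</p>
  <d-code block language="python">
# Indirect illustration of memory-access patterns: reading a transposed
# (strided) view stresses the memory system much more than a contiguous read.
import time
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(8192, 8192, device=device)

def bench(fn, iters=10):
    if device == "cuda":
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    if device == "cuda":
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

print(f"contiguous copy: {bench(lambda: x.clone()) * 1e3:.2f} ms")
print(f"strided copy:    {bench(lambda: x.t().contiguous()) * 1e3:.2f} ms")
  </d-code>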
2277
 
2278
  <h4>Memory Coalescing</h4>
2279
 
 
2411
 
2412
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2413
 
2414
+ <p>Let's briefly mention one last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2415
 
2416
  <h4>Minimizing Control Divergence</h4>
2417
 
 
2566
 
2567
  <p>We can see that float32 spans 80 orders of magnitude and float16 sacrifices a lot of range while bfloat16 maintains the full range. The two float8 formats reduce the range even further: e5m2 can maintain the float16 range while e4m3 has an even smaller range.</p>
2568
 
2569
+ <p>How come some formats are able to maintain the range and others not? Let’s investigate the resolution by plotting 10,000 points between 1 and 2. Each point will be rounded to the nearest representable number in each format:</p>
2570
 
2571
  <p><img alt="image.png" src="/assets/images/mixedprecision_2.png" /></p>
2572
 
2573
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this at the cost of sacrificing more precision. In the case of float8 the situation is even more dire, as e4m3 can represent only 7 and e5m2 only 3 numbers on the interval 1-2.</p>
2574
 
2575
+ <p>A common metric to measure a format's resolution is epsilon: the gap between <d-math>1.00</d-math> and the first representable number after it. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19 \times 10^{-7}</d-math>). For float16 it is <d-math>\sim 10^{-3}</d-math> and for bfloat16 roughly 10x higher still.</p>
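  <p>You can check these values directly with PyTorch's <code>torch.finfo</code>; note that the float8 dtypes shown here assume a recent PyTorch release that exposes them:</p>
  <d-code block language="python">
# Print machine epsilon (the gap just above 1.0) and the max value for the
# formats discussed above. The float8 dtypes require a recent PyTorch release.
import torch

dtypes = [torch.float32, torch.float16, torch.bfloat16,
          torch.float8_e4m3fn, torch.float8_e5m2]
for dtype in dtypes:
    info = torch.finfo(dtype)
    print(f"{dtype}: eps={info.eps:.3g}, max={info.max:.3g}")
  </d-code>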
2576
 
2577
  <p>The idea of mixed precision training is to use some of these lower-precision formats while maintaining the performance of full precision training. </p>
2578
 
 
3047
  print(f"After broadcast on rank {dist.get_rank()}: {tensor}")
3048
 
3049
  init_process()
3050
+ example_broadcast()
3051
  </d-code>
3052
 
3053
 
 
3380
 
3381
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3382
 
3383
+ <p>Now that we've covered the fundamental operations for distributed training, you should be ready to follow the blog post easily.</p>
3384
 
3385
  <h3>A1: Distributed Training Profiling</h3>
3386
 
 
3829
  }
3830
  if (level === 0)
3831
  ToC += '<div>' + link + '</div>';
3832
+ else if (level === 1)
 
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
dist/main.bundle.js CHANGED
@@ -5544,133 +5544,14 @@ function _loadFragments() {
5544
  return _loadFragments.apply(this, arguments);
5545
  }
5546
 
5547
- ;// ./src/syncHFSpacesURLHash.js
5548
- var queryArg = "section";
5549
- function syncHFSpacesURLHash() {
5550
- // Handle explicit section requests (don't update hash automatically on load)
5551
- var hasExplicitRequest = handleExplicitSectionRequest();
5552
-
5553
- // Set up hash change monitoring
5554
- updateHashBasedOnHashChange();
5555
-
5556
- // Always set up scroll monitoring to update hash during scrolling
5557
- setupScrollMonitoring();
5558
-
5559
- // If no explicit request, we don't update the hash on initial load
5560
- // The hash will only start updating when the user scrolls
5561
- }
5562
- function handleExplicitSectionRequest() {
5563
- // Check for section parameter in URL
5564
- var urlParams = new URLSearchParams(window.location.search);
5565
- var sectionId = urlParams.get(queryArg);
5566
-
5567
- // If we have an explicit section request
5568
- if (sectionId) {
5569
- var targetElement = document.getElementById(sectionId);
5570
- if (targetElement) {
5571
- // Slight delay to ensure the browser doesn't try to do its own scrolling first
5572
- setTimeout(function () {
5573
- targetElement.scrollIntoView();
5574
- history.replaceState(null, null, "#".concat(sectionId));
5575
- }, 100);
5576
- }
5577
- return true;
5578
- }
5579
-
5580
- // No explicit section parameter found
5581
- return false;
5582
- }
5583
- function setupScrollMonitoring() {
5584
- // Variables to manage throttling
5585
- var isScrolling = false;
5586
- var lastKnownScrollPosition = 0;
5587
- var initialScroll = true;
5588
-
5589
- // Add the scroll event listener
5590
- window.addEventListener('scroll', function () {
5591
- lastKnownScrollPosition = window.scrollY;
5592
- if (!isScrolling) {
5593
- window.requestAnimationFrame(function () {
5594
- // Skip the first scroll event which might be browser's automatic scroll
5595
- // to a hash on page load
5596
- if (initialScroll) {
5597
- initialScroll = false;
5598
- } else {
5599
- updateHashBasedOnScroll(lastKnownScrollPosition);
5600
- }
5601
- isScrolling = false;
5602
- });
5603
- }
5604
- isScrolling = true;
5605
- });
5606
- }
5607
-
5608
- // Function to update the URL hash based on scroll position
5609
- function updateHashBasedOnScroll(scrollPosition) {
5610
- var closestHeading = findClosestHeading(scrollPosition);
5611
-
5612
- // Update the URL hash if we found a closest element
5613
- if (closestHeading && closestHeading.id) {
5614
- // Only update if the hash is different to avoid unnecessary operations
5615
- if (window.location.hash !== "#".concat(closestHeading.id)) {
5616
- silentlyUpdateHash(closestHeading.id);
5617
- postMessageToHFSpaces(closestHeading.id);
5618
- }
5619
- }
5620
- }
5621
-
5622
- // Find the closest heading to the current scroll position
5623
- function findClosestHeading(scrollPosition) {
5624
- // Get only heading elements with IDs that we want to track
5625
- var headingsWithIds = Array.from(document.querySelectorAll('h1[id], h2[id], h3[id], h4[id], h5[id], h6[id]'));
5626
-
5627
- // Skip if there are no headings with IDs
5628
- if (headingsWithIds.length === 0) return null;
5629
-
5630
- // Find the element closest to the middle of the viewport
5631
- var closestHeading = null;
5632
- var closestDistance = Infinity;
5633
- var viewportMiddle = scrollPosition + window.innerHeight / 2;
5634
-
5635
- // Iterate through all headings to find the closest one
5636
- headingsWithIds.forEach(function (heading) {
5637
- var headingTop = heading.getBoundingClientRect().top + scrollPosition;
5638
- var distance = Math.abs(headingTop - viewportMiddle);
5639
- if (distance < closestDistance) {
5640
- closestDistance = distance;
5641
- closestHeading = heading;
5642
- }
5643
- });
5644
- return closestHeading;
5645
- }
5646
-
5647
- // Update hash without triggering scroll or other side effects
5648
- function silentlyUpdateHash(id) {
5649
- history.replaceState(null, null, "#".concat(id));
5650
- }
5651
- function updateHashBasedOnHashChange() {
5652
- window.addEventListener('hashchange', function () {
5653
- var elementId = window.location.hash.slice(1);
5654
- postMessageToHFSpaces(elementId);
5655
- });
5656
- }
5657
- function postMessageToHFSpaces(elementId) {
5658
- var parentOrigin = "https://huggingface.co";
5659
- window.parent.postMessage({
5660
- queryString: "".concat(queryArg, "=").concat(elementId)
5661
- }, parentOrigin);
5662
- }
5663
-
5664
  ;// ./src/index.js
5665
  // import { plotClusters } from './clusters'
5666
 
5667
 
5668
-
5669
  document.addEventListener("DOMContentLoaded", function () {
5670
  console.log("DOMContentLoaded");
5671
  loadFragments();
5672
  init_memory_plot();
5673
- syncHFSpacesURLHash();
5674
  }, {
5675
  once: true
5676
  });
 
5544
  return _loadFragments.apply(this, arguments);
5545
  }
5546
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5547
  ;// ./src/index.js
5548
  // import { plotClusters } from './clusters'
5549
 
5550
 
 
5551
  document.addEventListener("DOMContentLoaded", function () {
5552
  console.log("DOMContentLoaded");
5553
  loadFragments();
5554
  init_memory_plot();
 
5555
  }, {
5556
  once: true
5557
  });
dist/main.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
src/bibliography.bib CHANGED
@@ -488,7 +488,7 @@ url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
- url = {https://github.com/pytorch/ao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
 
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
+ url = {https://github.com/pytorch/torchao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
src/distill.js CHANGED
@@ -2105,12 +2105,6 @@ d-appendix > distill-appendix {
2105
  <div>Feb 19, 2025</div>
2106
  </div>
2107
  </div>
2108
- <div class="side pdf-download">
2109
- <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook/resolve/main/The_Ultra-Scale_Playbook_Training_LLMs_on_GPU_Clusters.pdf">Download PDF
2110
- <br>
2111
- <img style="width: 32px;" src="../assets/images/256px-PDF.png" alt="PDF"></a>
2112
-
2113
- </div>
2114
  `;
2115
  }
2116
 
 
2105
  <div>Feb 19, 2025</div>
2106
  </div>
2107
  </div>
 
 
 
 
 
 
2108
  `;
2109
  }
2110
 
src/index.html CHANGED
@@ -73,25 +73,25 @@
73
  </d-contents>
74
 
75
  <p>
76
- Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
- <aside>Reading time: 2-4 days. <br>For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
- This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
82
 
83
- <p>As the size of the clusters used to train these models grew, various techniques such as data parallelism, tensor parallelism, pipeline parallelism or context parallelism as well as ZeRO or kernel fusion have been invented to makes sure that GPUs are highly utilized at all times. This significantly reduces training time and makes the best use of this expensive hardware. Even more, as the challenge of scaling up AI training goes beyond just building the initial models and teams have found that fine-tuning large models on specialized data often produces the best results, generally involving the same distributed training techniques. In this book we'll progressively go over all of these techniques –from the simplest to the most refined ones– while keeping a single story-line to understand where each method comes from.</p>
84
 
85
  <aside>If you have questions or remarks open a discussion on the <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook/discussions?status=open&type=discussion">Community tab</a>!</aside>
86
 
87
- <p>We'll assume you have some simple basic knowledge about current LLM architecture and are roughtly familiar with how deep learning model are trained, but you can be generally new to distributed training. If needed, the basics of model training can be found in great courses found at <a href="https://www.deeplearning.ai">DeepLearning.ai</a> or on the <a href="https://pytorch.org/tutorials/beginner/basics/intro.html">PyTorch tutorial sections</a>. This book can be seen as the second part of a trilogy following our first blog on processing data for pre-training, the so-called β€œ<a href="https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1">FineWeb blog post</a>”. Having read both blog posts, you should have almost all the core knowledge needed to fully understand how how performing LLMs are being built nowadays, just missing some final spices regarding data mixing and architecture choices to complete the recipe (stay tuned for part three…).</p>
88
 
89
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating
90
  the template on which we based this blog post.</aside>
91
 
92
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
93
 
94
- <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn about which parts of a language model eat away your memory and when during training it happens. You’ll learn how we can solve memory constraints by parallelizing the models and increase the throughput by scaling up GPUs. As a result you'll understand how the following widget to compute the memory breakdown of a transformer model works: </p>
95
  <aside>Note that we're still missing Pipeline Parallelism in this widget. To be added as an exercise for the reader.</aside>
96
 
97
  <div class="large-image-background-transparent">
@@ -268,7 +268,7 @@
268
  <ol>
269
  <li><strong>Memory Usage</strong>: it's a hard limitation - if a training step doesn't fit in memory, training cannot proceed</li>
270
  <li><strong>Compute Efficiency</strong>: we want our hardware to spend most time computing, so we need to reduce time spent on data transfers or waiting for other GPUs to perform work.</li>
271
- <li><strong>Communication overhead</strong>: we want to minimize communication overhead as it keeps GPUs idle. To achieve this we will try to make best use of intra-node (fast) and inter-node (slower) bandwidths as well as overlap communication with compute as much as possible.</li>
272
  </ol>
273
  <p>In many places we'll see that we can trade one of these (computation, communication, memory) for another (e.g. recomputation or Tensor Parallelism). Finding the right balance is key to scaling training.</p>
274
  <p>
@@ -341,7 +341,7 @@
341
 
342
  <aside>For instance, during DeepSeek-V3/R1 training β€œthe batch size is gradually increased from 3072 input sequences to 15360 in the training of the first 469B tokens, and then keeps at 15360 input samples in the remaining training”.</aside>
343
 
344
- <p>Batch size also affects the time it takes to train on a given text dataset: a small batch size will require more optimizer steps to train on the same amount of samples. Optimizer steps are costly (in compute time) and the total time to train will thus increase compared to using a larger batch size. This being said, note that the batch size can often be adjusted quite largely around the optimal batch size without major impact on the performance of the model, i.e. the sensitivity of final model performances to the exact batch size value is usually rather low around the optimal batch size.</p>
345
 
346
  <p>In the LLM pretraining community, batch sizes are commonly reported in terms of tokens rather than in number of samples (<d-math>bst</d-math> = Batch Size Tokens), this makes training numbers generally independent of the exact input sequence length used during the training.</p>
347
 
@@ -353,7 +353,7 @@
353
 
354
  <p>From here onward we’ll show the formulas for the batch size in terms of samples but you can always get its token-unit counterpart by multiplying it with the sequence length.</p>
355
 
356
- <p>A sweet spot for recent LLM training is typically on the order of 4-60 million tokens per batch. The batch size as well as the training corpus have been steadily increasing over the years: Llama 1 was trained with a batch size of ~4M tokens for 1.4 trillion tokens while DeepSeek was trained with a batch size of ~60M tokens for 14 trillion tokens.</p>
357
 
358
  <p><strong>And our first challenge is already coming ahead when scaling the training of our model to these large batch sizes: out-of-memory issues. What should we do when our GPU doesn’t have enough memory to hold a full batch of our target batch size?</strong></p>
359
 
@@ -361,7 +361,7 @@
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
- <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
@@ -374,7 +374,7 @@
374
  <p class="note-box-title">πŸ“ Note</p>
375
  <div class="note-box-content">
376
  <p>
377
- You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
@@ -385,11 +385,11 @@
385
 
386
  <p>These items are stored as tensors which come in different <em>shapes</em> and <em>precisions</em>. The <em>shapes</em> are determined by hyper-parameters such as batch size, sequence length, model hidden dimensions, attention heads, vocabulary size, and potential model sharding as we’ll see later. <em>Precision</em> refers to formats like FP32, BF16, or FP8, which respectively require 4, 2, or 1 byte to store each single value in the tensor. We will have a full discussion of the different precisions and their trade-offs in the <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> section, for now let's just keep in mind that the memory requirements for these various format will be different and that will impact the memory usage of the items we need to store.</p>
387
 
388
- <p>So how can I quickly determine memory usage from these variables? One simple way is to do this empirically and just measure it.</p>
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
- <p>Using the Pytorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
395
 
@@ -434,7 +434,7 @@
434
  \end{aligned}
435
  </d-math>
436
 
437
- <p>Now let’s have look how things change if we use a lower precision. For stability reasons (see <a target="_self" href="#mixed_precision_training">the mixed-precision training section below</a>) we often don't use full low precision training but a mix of higher and lower precision called "mixed precision"<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays for mixed precision training is to generally use BF16 for most of the computations –requiring 2 bytes per parameter and gradient– as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
438
 
439
  <aside>See some more details below when we cover the ZeRO methods.</aside>
440
 
@@ -504,7 +504,7 @@
504
 
505
  <p>As we can see, as soon as we reach <strong>7B</strong> (!), weights and optimizer requirements already starts to add up significantly and exceed the size of a typical GPU memory, e.g. 80GB for a H100 GPU.</p>
506
 
507
- <p>But for now, let’s start with models which still fit in a single GPU, take a look at the last big contributor to our memory budget: the activation memory.</p>
508
 
509
  <h4>Activations memory</h4>
510
 
@@ -518,7 +518,7 @@
518
 
519
  <p>For the exact derivation of the numbers, you can follow this original NVIDIA paper on recomputation <d-cite bibtex-key="korthikanti2022recomputation"></d-cite>, it essentially requires you to do some accounting of all the sizes of intermediate activations between each operation in a transformer layer.</p>
520
 
521
- <p>An interesting observation here is that memory usage is not static for a given model; rather, it scales linearly with the batch size and quadratically with the sequence length. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths for example for Llama models (<code>bs=1</code>):</p>
522
 
523
  <div class="l-body-outset" id="fragment-memusage_activations"></div>
524
  <!-- <script>
@@ -539,7 +539,7 @@
539
 
540
  <h3>Activation recomputation</h3>
541
 
542
- <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. feed-forward, layernorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
543
 
544
  <div class="svg-container" id="svg-activation_recomputation"> </div>
545
  <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
@@ -548,7 +548,7 @@
548
 
549
  <ul>
550
  <li><strong>Full</strong>: We checkpoint activations at the transition point between each layer of the Transformer model. This is usually called the <code>full</code> strategy since it requires a forward pass through each layer essentially adding a full forward pass during the backward pass. This strategy saves the most memory but is the most expensive one in terms of compute. It generally increases the compute cost and time by up to 30-40% which is very noticeable.</li>
551
- <li><strong>Selective</strong>: In general we can do better than full. The authors of the recomputation paper<d-cite bibtex-key="korthikanti2022recomputation"></d-cite> did a detailed analysis studying which activations grow the largest and have the cheapest recomputation cost in terms of FLOPs. Turns out that the attention computations fall in that category, and thus we can usually discard them and focus on checkpointing the expensive feedforward computations. For a GPT-3 (175B) model this means <strong>70% activation memory reduction at a 2.7% compute cost</strong>.</li>
552
  </ul>
553
 
554
  <aside>In recent models like DeepSeek V3, selective checkpointing is performed, storing even a smaller size of attention activation β€”using so-called β€œMulti-Head Latent Attention” (MLA)– to optimize activation memory usage.</aside>
@@ -611,7 +611,7 @@
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
- <p>Before that, let's quickly see how we can vizualise computation and communication with a short tour of one of the most useful tool in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
@@ -802,7 +802,7 @@
802
 
803
  <p>While data parallelism nicely overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. Why? Because as we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly and the network requirements are becoming too large for the benefits. As a result, our setup will become less and less efficient which each additional GPU we add to the system.</p>
804
 
805
- <p>Let's see this happening in practice with some benchmark:</p>
806
 
807
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
808
  <div class="l-body-outset" id="fragment-dp_scaling"></div>
@@ -864,7 +864,7 @@
864
 
865
  <h4>Memory usage revisited</h4>
866
 
867
- <p>You likely remember from <a target="_self" href="#memory_usage_in_transformers"> our previous section</a> the memory usage of optimizer states, gradients, and parameters during a standard training. Let's call our model's parameters count <d-math>\Psi</d-math> (previously N but here we use the original ZeRO paper notation). In <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> (more details in a later section) with the Adam optimizer, the memory usage for each item we need to store is:</p>
868
 
869
  <ul>
870
  <li>Model’s parameters (half precision i.e. bf16/fp16): <d-math>2\Psi</d-math></li>
@@ -902,7 +902,7 @@
902
  </ul>
903
  <aside>Note: reduce-scatter is 2 times faster than all reduce! <em>Yay, a third communication primitive!</em></aside>
904
 
905
- <p>You may be wondering what is this "reduce-scatter" operation and how this all look so let's try to make this more graphical with the figure below. We'll go over all the steps of a forward/backward pass cycle:</p>
906
 
907
  <p><img alt="dp_zero1.gif" src="/assets/images/dp_zero1.gif" /></p>
908
 
@@ -1320,6 +1320,7 @@
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
 
1323
  </ul>
1324
 
1325
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
@@ -1353,9 +1354,9 @@
1353
 
1354
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1355
 
1356
- <p>The core idea of Context Parallelism is to apply a similar idea to the Sequence Parallelism approach (aka to split along the sequence length) but to the modules where we already apply Tensor Parallelism. We will thus split these modules along two dimensions, thereby also reducing the effect of sequence length. You will find this approach quite intuitive after all we’ve already convered but... there is a trick to it so stay awake!</p>
1357
 
1358
- <p>For Context Parallelism; just like Sequence Parallelism, we’ll split the input along the sequence dimension but we now apply this splitting along the full model, instead of only the sequence parallel regions of the model as we’ve done previously with Tensor + Sequence Parallelism.</p>
1359
 
1360
  <!-- <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
1361
  -->
@@ -1363,7 +1364,7 @@
1363
 
1364
  <p>There is one important exception though, as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or, in the case of causal attention, at least attend to each previous token.</p>
1365
 
1366
- <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1367
 
1368
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast? Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1369
 
@@ -1503,7 +1504,7 @@
1503
  </div>
1504
  <p>The remaining idle time is indicated in grey and usually called the “bubble” and the sight of this probably breaks your heart after we spent so much time optimizing throughput.</p>
1505
 
1506
- <p>We can quantify how efficient a pipeline setup is by looking at how much time we lose because of the bubble. Let’s say <d-math>t_f</d-math> and <d-math>t_b</d-math> are the times for the forward and backward pass, respectively, as measured for one microbatch and one stage of the pipeline (a simple assumption is often to have <d-math>t_b \approx 2 \times t_f</d-math> which you can see on the above graph). If we could perfectly parallelize the ideal total time would be <d-math>t_{id}=t_f + t_b</d-math>. However, we can count on the graph that due to the pipeline bubble there is additional time of <d-math>t_{pb}=(p-1)*(t_f+t_b)</d-math> (where <d-math>p</d-math> is the degree of pipeline parallelism, i.e the number of GPU on the above graph) ie. the time each GPU is waiting while other GPUs are computing.</p>
1507
 
1508
  <p>We can compute the ratio of the additional bubble time over the ideal time:
1509
  </p>
@@ -1591,7 +1592,7 @@
1591
 
1592
  <p><img alt="pp_1f1b_interleaved.svg" src="/assets/images/pp_1f1b_interleaved.svg" /></p>
1593
 
1594
- <div class="figure-legend"><p>An example of interleaved pipeline parallelism for a model with layers distributed across 4 GPUs. Numbers still correspond to the microbatches IDs but for clarity we've colored differently the first and the last layers of the model to illustrate how layers are spread across GPUs.</p>
1595
  </div>
1596
 
1597
  <p>As a consequence we see additional communications happening as the model goes several times through each GPU for the same computation that previously just took one pass. However, each forward and backward pass is divided by a factor of <d-math>v</d-math>, where <d-math>v</d-math> is the number of stages or model chunks per GPUs as we are able to better interleave forward and backward passes. </p>
@@ -1745,7 +1746,7 @@
1745
  </table>
1746
  </div>
1747
 
1748
- <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches and the choice between both will depend whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible un-necessary communication overhead.</p>
1749
 
1750
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1751
 
@@ -1793,7 +1794,7 @@
1793
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1794
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1795
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1796
- <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component with the exception that modules and layers need to be balanced in Pipeline Parallelism, the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1797
  </ul>
1798
 
1799
  <table>
@@ -1912,7 +1913,7 @@
1912
 
1913
  <h2>Finding the Best Training Configuration</h2>
1914
 
1915
- <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remain a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1916
 
1917
  <p>We touched on this a little in the previous section, but let's now walk in detail through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1918
 
@@ -2015,7 +2016,7 @@
2015
 
2016
  <h3>Lessons learned on benchmarking</h3>
2017
 
2018
- <p>Our goal for this book was not only to discuss theory and implementations but provide actual data points as well. So the plan was simple: let's run every possible distributed configuration for every model and a number of cluster sizes (namely 1-64 nodes of 8xH100s). Even after excluding impossible configuration we still needed to run thousands of experiments. </p>
2019
 
2020
  <p>
2021
  On paper this sounds easy enough: we can easily launch big arrays of jobs on our cluster. However, as soon as we launched the first batches of experiments, troubles began:
@@ -2272,7 +2273,7 @@
2272
 
2273
  </ol>
2274
 
2275
- <p>Let’s talk about one of the most frequent technique we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance by a lot.</p>
2276
 
2277
  <h4>Memory Coalescing</h4>
2278
 
@@ -2410,7 +2411,7 @@
2410
 
2411
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2412
 
2413
- <p>Let's briefly go through a last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2414
 
2415
  <h4>Minimizing Control Divergence</h4>
2416
 
@@ -2565,13 +2566,13 @@
2565
 
2566
  <p>We can see that float32 spans 80 orders of magnitude and float16 sacrifices a lot of range while bfloat16 maintains the full range. The two float8 formats reduce the range even further: e5m2 can maintain the float16 range while e4m3 has an even smaller range.</p>
2567
 
2568
- <p>How come some formats are able to maintain the range and others not? Let’s investigate the resolution by plotting 10,000 points between 1 and 2. Each point will be rounded to the nearest representable number in each format:</p>
2569
 
2570
  <p><img alt="image.png" src="/assets/images/mixedprecision_2.png" /></p>
2571
 
2572
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this at the cost of sacrificing more precision. In the case of float8 the situation is even more dire, as e4m3 can represent only 7 and e5m2 only 3 numbers on the interval 1-2.</p>
2573
 
2574
- <p>A common metric to measure a formats resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19^{-7}</d-math>). For float16 it is ~ <d-math>10^{-3}</d-math> and for bfloat 10x higher still.</p>
2575
 
2576
  <p>The idea of mixed precision training is to use some of these lower-precision formats while maintaining the performance of full precision training. </p>
2577
 
@@ -3046,7 +3047,7 @@
3046
  print(f"After broadcast on rank {dist.get_rank()}: {tensor}")
3047
 
3048
  init_process()
3049
- example_broadcast()
3050
  </d-code>
3051
 
3052
 
@@ -3379,7 +3380,7 @@
3379
 
3380
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3381
 
3382
- <p>Now that we covered the fundamental operations for distributed training and you should now be ready to follow the blog post easily.</p>
3383
 
3384
  <h3>A1: Distributed Training Profiling</h3>
3385
 
@@ -3828,8 +3829,7 @@
3828
  }
3829
  if (level === 0)
3830
  ToC += '<div>' + link + '</div>';
3831
- else
3832
- // else if (level === 1)
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
 
73
  </d-contents>
74
 
75
  <p>
76
+ Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
+ <aside>Reading time: 2-4 days. For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
+ This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
82
 
83
+ <p>As the size of the clusters used to train these models grew, various techniques such as data parallelism, tensor parallelism, pipeline parallelism or context parallelism as well as ZeRO or kernel fusion have been invented to make sure that GPUs are highly utilized at all times. This significantly reduces training time and makes the best use of this expensive hardware. Even more, the challenge of scaling up AI training goes beyond just building the initial models: teams have found that fine-tuning large models on specialized data often produces the best results, generally involving the same distributed training techniques. In this book we'll progressively go over all of these techniques –from the simplest to the most refined ones– while keeping a single story-line to understand where each method comes from.</p>
84
 
85
  <aside>If you have questions or remarks open a discussion on the <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook/discussions?status=open&type=discussion">Community tab</a>!</aside>
86
 
87
+ <p>We'll assume you have some basic knowledge about current LLM architecture and are roughly familiar with how deep learning models are trained, but you can be generally new to distributed training. If needed, the basics of model training can be found in great courses at <a href="https://www.deeplearning.ai">DeepLearning.ai</a> or on the <a href="https://pytorch.org/tutorials/beginner/basics/intro.html">PyTorch tutorial sections</a>. This book can be seen as the second part of a trilogy following our first blog on processing data for pre-training, the so-called “<a href="https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1">FineWeb blog post</a>”. Having read both blog posts, you should have almost all the core knowledge needed to fully understand how high-performing LLMs are being built nowadays, just missing some final spices regarding data mixing and architecture choices to complete the recipe (stay tuned for part three…).</p>
88
 
89
  <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating
90
  the template on which we based this blog post.</aside>
91
 
92
  <p>The book is built on the following <strong>three general foundations</strong>:</p>
93
 
94
+ <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn about which parts of a language model eat away your memory and when during training it happens. You’ll learn how we can solve memory constraints by parallelizing the models and increase the throughput by scaling up GPUs. As a result you'll understand how the following widget to compute the memory breakdown of a transformer model works: </p>
95
  <aside>Note that we're still missing Pipeline Parallelism in this widget. To be added as an exercise for the reader.</aside>
96
 
97
  <div class="large-image-background-transparent">
 
268
  <ol>
269
  <li><strong>Memory Usage</strong>: it's a hard limitation - if a training step doesn't fit in memory, training cannot proceed</li>
270
  <li><strong>Compute Efficiency</strong>: we want our hardware to spend most time computing, so we need to reduce time spent on data transfers or waiting for other GPUs to perform work.</li>
271
+ <li><strong>Communication overhead</strong>: we want to minimize communication overhead as it keeps GPUs idle. To achieve this we will try to make best use of intra-node (fast) and inter-node (slower) bandwidths as well as overlap communication with compute as much as possible.</li>
272
  </ol>
273
  <p>In many places we'll see that we can trade one of these (computation, communication, memory) for another (e.g. recomputation or Tensor Parallelism). Finding the right balance is key to scaling training.</p>
274
  <p>
 
341
 
342
  <aside>For instance, during DeepSeek-V3/R1 training β€œthe batch size is gradually increased from 3072 input sequences to 15360 in the training of the first 469B tokens, and then keeps at 15360 input samples in the remaining training”.</aside>
343
 
344
+ <p>Batch size also affects the time it takes to train on a given text dataset: a small batch size will require more optimizer steps to train on the same amount of samples. Optimizer steps are costly (in compute time) and the total time to train will thus increase compared to using a larger batch size. This being said, note that the batch size can often be adjusted quite largely around the optimal batch size without major impact on the performance of the model, i.e. the sensitivity of final model performances to the exact batch size value is usually rather low around the optimal batch size.</p>
345
 
346
  <p>In the LLM pretraining community, batch sizes are commonly reported in terms of tokens rather than in number of samples (<d-math>bst</d-math> = Batch Size Tokens), this makes training numbers generally independent of the exact input sequence length used during the training.</p>
347
 
 
353
 
354
  <p>From here onward we’ll show the formulas for the batch size in terms of samples but you can always get its token-unit counterpart by multiplying it with the sequence length.</p>
355
 
356
+ <p>A sweet spot for recent LLM training is typically on the order of 4-60 million tokens per batch. The batch size as well as the training corpus have been steadily increasing over the years: Llama 1 was trained with a batch size of ~4M tokens for 1.4 trillion tokens while DeepSeek was trained with a batch size of ~60M tokens for 14 trillion tokens.</p>
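  <p>As a quick sanity check of these orders of magnitude: at Llama 1's sequence length of 2048 tokens, a ~4M-token batch corresponds to roughly <d-math>4 \times 10^6 / 2048 \approx 2000</d-math> samples per optimizer step, while the 15360 sequences mentioned in the aside above give, assuming a 4096-token sequence length, about <d-math>15360 \times 4096 \approx 6.3 \times 10^7</d-math> tokens per batch, i.e. the ~60M figure quoted here.</p>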
357
 
358
  <p><strong>And our first challenge is already coming ahead when scaling the training of our model to these large batch sizes: out-of-memory issues. What should we do when our GPU doesn’t have enough memory to hold a full batch of our target batch size?</strong></p>
359
 
 
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
+ <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
 
374
  <p class="note-box-title">πŸ“ Note</p>
375
  <div class="note-box-content">
376
  <p>
377
+ You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
 
385
 
386
  <p>These items are stored as tensors which come in different <em>shapes</em> and <em>precisions</em>. The <em>shapes</em> are determined by hyper-parameters such as batch size, sequence length, model hidden dimensions, attention heads, vocabulary size, and potential model sharding as we’ll see later. <em>Precision</em> refers to formats like FP32, BF16, or FP8, which respectively require 4, 2, or 1 byte to store each single value in the tensor. We will have a full discussion of the different precisions and their trade-offs in the <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> section, for now let's just keep in mind that the memory requirements for these various format will be different and that will impact the memory usage of the items we need to store.</p>
387
 
388
+ <p>So how can I quickly determine memory usage from these variables? One simple way is to do this empirically and just measure it.</p>
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
+ <p>Using the Pytorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
395
 
 
434
  \end{aligned}
435
  </d-math>
436
 
437
+ <p>Now let’s have a look at how things change if we use a lower precision. For stability reasons (see <a target="_self" href="#mixed_precision_training">the mixed-precision training section below</a>) we often don't use full low precision training but a mix of higher and lower precision called "mixed precision"<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default nowadays for mixed precision training is to generally use BF16 for most of the computations –requiring 2 bytes per parameter and gradient– as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradient, we need to store the optimizer states: for the Adam optimizer, this requires the momentum and the variance usually stored in FP32 for numerical stability, each using 4 bytes. </p>
438
 
439
  <aside>See some more details below when we cover the ZeRO methods.</aside>
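  <p>A quick back-of-the-envelope check of this accounting, taking a 7B-parameter model as an illustrative assumption:</p>
  <d-code block language="python">
# Bytes per parameter following the accounting above: BF16 params and grads,
# FP32 copies of both, and Adam momentum + variance in FP32.
PSI = 7e9  # illustrative parameter count (7B model)

bytes_per_param = (
    2 + 2    # BF16 parameters and gradients
    + 4 + 4  # FP32 copies of parameters and gradients
    + 4 + 4  # Adam momentum and variance (FP32)
)
print(f"{bytes_per_param} bytes/param -> "
      f"{PSI * bytes_per_param / 1e9:.0f} GB before activations")
  </d-code>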
440
 
 
504
 
505
  <p>As we can see, as soon as we reach <strong>7B</strong> (!), weights and optimizer requirements already start to add up significantly and exceed the size of a typical GPU memory, e.g. 80GB for a H100 GPU.</p>
506
 
507
+ <p>But for now, let’s start with models which still fit in a single GPU, and take a look at the last big contributor to our memory budget: the activation memory.</p>
508
 
509
  <h4>Activations memory</h4>
510
 
 
518
 
519
  <p>For the exact derivation of the numbers, you can follow this original NVIDIA paper on recomputation <d-cite bibtex-key="korthikanti2022recomputation"></d-cite>, it essentially requires you to do some accounting of all the sizes of intermediate activations between each operation in a transformer layer.</p>
520
 
521
+ <p>An interesting observation here is that the memory usage is not static for a given model but scales linearly with both the sequence length and the batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths, for example for Llama models (<code>bs=1</code>):</p>
522
 
523
  <div class="l-body-outset" id="fragment-memusage_activations"></div>
524
  <!-- <script>
 
539
 
540
  <h3>Activation recomputation</h3>
541
 
542
+ <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute these on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. feed-forward, layernorm etc.), such that we can use them during the backward pass to compute gradients. When we use recomputation we typically will only store activations at a few key points along the model architecture, discard the rest of activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing again a sub-part of the forward pass to trade off memory for compute. It generally looks like this:</p>
543
 
544
  <div class="svg-container" id="svg-activation_recomputation"> </div>
545
  <div class="info" id="svg-activation_recomputation-info">Hover over the network elements to see their details</div>
 
548
 
549
  <ul>
550
  <li><strong>Full</strong>: We checkpoint activations at the transition point between each layer of the Transformer model. This is usually called the <code>full</code> strategy since it requires a forward pass through each layer essentially adding a full forward pass during the backward pass. This strategy saves the most memory but is the most expensive one in terms of compute. It generally increases the compute cost and time by up to 30-40% which is very noticeable.</li>
551
+ <li><strong>Selective</strong>: In general we can do better than full. The authors of the recomputation paper<d-cite bibtex-key="korthikanti2022recomputation"></d-cite> did a detailed analysis studying which activations grow the largest and have the cheapest recomputation cost in terms of FLOPs. Turns out that the attention computations fall in that category, and thus we can usually discard them and focus on checkpointing the expensive feedforward computations. For a GPT-3 (175B) model this means <strong>70% activation memory reduction at a 2.7% compute cost</strong>.</li>
552
  </ul>
553
 
554
  <aside>In recent models like DeepSeek V3, selective checkpointing is performed, storing even a smaller size of attention activation β€”using so-called β€œMulti-Head Latent Attention” (MLA)– to optimize activation memory usage.</aside>
 
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
+ <p>Before that, let's quickly see how we can visualize computation and communication with a short tour of one of the most useful tools in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
 
802
 
803
  <p>While data parallelism nicely overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. Why? Because as we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly and the network requirements are becoming too large for the benefits. As a result, our setup will become less and less efficient with each additional GPU we add to the system.</p>
804
 
805
+ <p>Let's see this happening in practice with some benchmarks:</p>
806
 
807
  <!-- <p><img alt="image.png" src="/assets/images/dp_scaling.svg"/></p> -->
808
  <div class="l-body-outset" id="fragment-dp_scaling"></div>
 
864
 
865
  <h4>Memory usage revisited</h4>
866
 
867
+ <p>You likely remember from <a target="_self" href="#memory_usage_in_transformers"> our previous section</a> the memory usage of optimizer states, gradients, and parameters during a standard training. Let's call our model's parameters count <d-math>\Psi</d-math> (previously N but here we use the original ZeRO paper notation). In <a target="_self" href="#mixed_precision_training">Mixed Precision Training</a> (more details in a later section) with the Adam optimizer, the memory usage for each item we need to store is:</p>
868
 
869
  <ul>
870
  <li>Model’s parameters (half precision i.e. bf16/fp16): <d-math>2\Psi</d-math></li>
 
902
  </ul>
903
  <aside>Note: reduce-scatter is 2 times faster than all reduce! <em>Yay, a third communication primitive!</em></aside>
904
 
905
+ <p>You may be wondering what this "reduce-scatter" operation is and how this all looks, so let's try to make this more graphical with the figure below. We'll go over all the steps of a forward/backward pass cycle:</p>
906
 
907
  <p><img alt="dp_zero1.gif" src="/assets/images/dp_zero1.gif" /></p>
908
 
 
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1323
+ <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1324
  </ul>
1325
 
1326
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
 
1354
 
1355
  <!-- <p><img alt="image.png" src="/assets/images/cp_memoryusage.svg" /></p> -->
1356
 
1357
+ <p>The core idea of Context Parallelism is to apply the same idea as the Sequence Parallelism approach (i.e., to split along the sequence length) but to the modules where we already apply Tensor Parallelism. We will thus split these modules along two dimensions, thereby also reducing the effect of sequence length. You will find this approach quite intuitive after all we’ve already covered, but... there is a trick to it, so stay awake!</p>
1358
 
1359
+ <p>For Context Parallelism, just like for Sequence Parallelism, we’ll split the input along the sequence dimension, but we now apply this splitting across the full model, instead of only in the sequence-parallel regions of the model as we did previously with Tensor + Sequence Parallelism.</p>
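  <p>The input sharding itself is the easy part. A minimal sketch (assuming a hypothetical <code>cp_rank</code> / <code>cp_world_size</code> pair describing the context-parallel group) could look like this:</p>
  <d-code block language="python">
import torch

def shard_sequence(input_ids, cp_rank, cp_world_size):
    # Keep only this rank's slice along the sequence dimension.
    # input_ids: [batch, seq_len]; seq_len is assumed divisible by cp_world_size.
    chunks = input_ids.chunk(cp_world_size, dim=1)
    return chunks[cp_rank].contiguous()

# e.g. a 4-way context-parallel group on a toy sequence of 16 tokens
x = torch.arange(16).reshape(1, 16)
print([shard_sequence(x, r, 4) for r in range(4)])
  </d-code>
  <p>Every operation that acts token-by-token (MLPs, layer norms, etc.) can then simply run on its local sequence slice without any extra communication.</p>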
1360
 
1361
  <!-- <p><img alt="cp_8Bmemoryusage.svg" src="/assets/images/cp_8Bmemoryusage.svg" /></p>
1362
  -->
 
1364
 
1365
  <p>There is one important exception, though, as we need to pay particular attention to the <strong>Attention blocks</strong> (haha... pun intended :D). In the attention module, each token needs to access key/value pairs from <strong>all</strong> other sequence tokens, or in the case of causal attention, at least attend to each previous token.</p>
1366
 
1367
+ <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module requires full communication between GPUs to exchange the necessary key/value data.</p>
1368
 
1369
  <p>That sounds very expensive if we do it naively. Is there a way to do this efficiently and fast? Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
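  <p>Before diving into the details, here is a deliberately simplified sketch of the communication pattern only: each GPU computes attention of its local queries against the key/value block it currently holds, then sends that block to the next rank in the ring while receiving a block from the previous rank. A real implementation also has to merge the partial attention outputs with an online-softmax rescaling (as in FlashAttention) and overlap the communication with the compute, both of which we gloss over here. It assumes an already-initialized <code>torch.distributed</code> process group:</p>
  <d-code block language="python">
import torch
import torch.distributed as dist

def ring_attention_pass(q, k, v, attn_fn):
    # Simplified sketch: rotate K/V blocks around the ring of context-parallel ranks.
    # attn_fn(q, k, v) returns a partial attention output for the current K/V block;
    # properly merging the partials requires an online-softmax correction (omitted).
    rank, world = dist.get_rank(), dist.get_world_size()
    send_to, recv_from = (rank + 1) % world, (rank - 1) % world
    partials = []
    for _ in range(world):
        partials.append(attn_fn(q, k, v))  # attend to the K/V block we currently hold
        k_next, v_next = torch.empty_like(k), torch.empty_like(v)
        # Send our current K/V block to the next rank while receiving the previous rank's.
        reqs = dist.batch_isend_irecv([
            dist.P2POp(dist.isend, k, send_to),
            dist.P2POp(dist.isend, v, send_to),
            dist.P2POp(dist.irecv, k_next, recv_from),
            dist.P2POp(dist.irecv, v_next, recv_from),
        ])
        for req in reqs:
            req.wait()
        k, v = k_next, v_next
    return partials  # one partial output per K/V block seen
  </d-code>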
1370
 
 
1504
  </div>
1505
  <p>The remaining idle time is indicated in grey and usually called the "bubble", and the sight of it probably breaks your heart after we spent so much time optimizing throughput.</p>
1506
 
1507
+ <p>We can quantify how efficient a pipeline setup is by looking at how much time we lose because of the bubble. Let’s say <d-math>t_f</d-math> and <d-math>t_b</d-math> are the times for the forward and backward pass, respectively, as measured for one microbatch and one stage of the pipeline (a simple assumption is often to have <d-math>t_b \approx 2 \times t_f</d-math>, which you can see on the above graph). If we could perfectly parallelize, the ideal total time would be <d-math>t_{id}=m \cdot t_f + m \cdot t_b</d-math>, where <d-math>m</d-math> is the number of microbatches. However, we can see on the graph that due to the pipeline bubble there is additional time of <d-math>t_{pb}=(p-1) \cdot (t_f+t_b)</d-math> (where <d-math>p</d-math> is the degree of pipeline parallelism, i.e., the number of GPUs on the above graph), i.e., the time each GPU is waiting while other GPUs are computing.</p>
1508
 
1509
  <p>We can compute the ratio of the additional bubble time over the ideal time:
1510
  </p>
 
1592
 
1593
  <p><img alt="pp_1f1b_interleaved.svg" src="/assets/images/pp_1f1b_interleaved.svg" /></p>
1594
 
1595
+ <div class="figure-legend"><p>An example of interleaved pipeline parallelism for a model with layers distributed across 4 GPUs. Numbers still correspond to the microbatch IDs, but for clarity we've colored the first and last layers of the model differently to illustrate how layers are spread across GPUs.</p>
1596
  </div>
1597
 
1598
  <p>As a consequence, we see additional communication happening as the model goes through each GPU several times for the same computation that previously took just one pass. However, each forward and backward pass is divided by a factor of <d-math>v</d-math>, where <d-math>v</d-math> is the number of stages or model chunks per GPU, as we are able to better interleave forward and backward passes. </p>
 
1746
  </table>
1747
  </div>
1748
 
1749
+ <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches, and the choice between the two will depend on whether you decide to focus communication on weights or on activations. While they can be combined, it's not often done in practice, as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize unnecessary communication overhead as much as possible.</p>
1750
 
1751
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenges. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1752
 
 
1794
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1795
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1796
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1797
+ <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component, with the exception that modules and layers need to be balanced for Pipeline Parallelism; the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1798
  </ul>
1799
 
1800
  <table>
 
1913
 
1914
  <h2>Finding the Best Training Configuration</h2>
1915
 
1916
+ <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models, as well as how and why they can be combined together. There remains a general question: which ones should we choose in the end, and how do we decide on a specific combination?</p>
1917
 
1918
  <p>We touched on this a little in the previous section, but let's now walk in detail through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1919
 
 
2016
 
2017
  <h3>Lessons learned on benchmarking</h3>
2018
 
2019
+ <p>Our goal for this book was not only to discuss theory and implementations but also to provide actual data points. So the plan was simple: let's run every possible distributed configuration for every model and a number of cluster sizes (namely 1-64 nodes of 8xH100s). Even after excluding impossible configurations, we still needed to run thousands of experiments. </p>
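  <p>To give a rough sense of why the number of runs explodes, a toy enumeration of valid combinations (with made-up parallelism ranges for illustration, not the exact grid we used) already looks like this:</p>
  <d-code block language="python">
from itertools import product

nodes_options = (1, 2, 4, 8, 16, 32, 64)      # nodes of 8 GPUs each
degree_options = (1, 2, 4, 8, 16, 32)         # candidate TP / PP degrees (illustrative)

configs = []
for nodes, tp, pp, micro_bs in product(nodes_options, degree_options, degree_options, (1, 2, 4, 8)):
    world_size = nodes * 8
    if world_size % (tp * pp) != 0:
        continue  # impossible: GPUs cannot be evenly split across TP x PP
    dp = world_size // (tp * pp)
    configs.append((nodes, dp, tp, pp, micro_bs))

print(len(configs), "runs for a single model size")
  </d-code>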
2020
 
2021
  <p>
2022
  On paper this sounds easy enough: we can easily launch big arrays of jobs on our cluster. However, as soon as we launched the first batches of experiments, troubles began:
 
2273
 
2274
  </ol>
2275
 
2276
+ <p>Let’s talk about one of the most common techniques we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has long latency and low bandwidth in comparison to the cache, which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance a lot.</p>
2277
 
2278
  <h4>Memory Coalescing</h4>
2279
 
 
2411
 
2412
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2413
 
2414
+ <p>Let's briefly mention one last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2415
 
2416
  <h4>Minimizing Control Divergence</h4>
2417
 
 
2566
 
2567
  <p>We can see that float32 spans 80 orders of magnitude and float16 sacrifices a lot of range, while bfloat16 maintains the full range. The two float8 formats reduce the range even further: e5m2 can maintain the float16 range while e4m3 has an even smaller range.</p>
2568
 
2569
+ <p>How come some formats are able to maintain the range and others are not? Let’s investigate their resolution by plotting 10,000 points between 1 and 2. Each point will be rounded to the nearest representable number in each format:</p>
2570
 
2571
  <p><img alt="image.png" src="/assets/images/mixedprecision_2.png" /></p>
2572
 
2573
  <p>We can see here that bfloat16 maintained the range of float32 over float16, but it did this at the cost of sacrificing more precision. In the case of float8 the situation is even more dire, as e4m3 can represent only 7 and e5m2 only 3 numbers on the interval 1-2.</p>
2574
 
2575
+ <p>A common metric to measure a format's resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19 \times 10^{-7}</d-math>). For float16 it is <d-math>\sim 10^{-3}</d-math>, and for bfloat16 it is 10x higher still.</p>
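  <p>You can quickly check these values yourself with <code>torch.finfo</code> (the float8 dtypes require a recent PyTorch version):</p>
  <d-code block language="python">
import torch

dtypes = [torch.float32, torch.bfloat16, torch.float16,
          torch.float8_e4m3fn, torch.float8_e5m2]  # float8 needs a recent PyTorch

for dtype in dtypes:
    info = torch.finfo(dtype)
    # eps is the gap between 1.0 and the next representable number; max shows the range
    print(f"{str(dtype):22} eps={info.eps:.3e}  max={info.max:.3e}")
  </d-code>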
2576
 
2577
  <p>The idea of mixed precision training is to use some of these lower-precision formats while maintaining the performance of full precision training. </p>
2578
 
 
3047
  print(f"After broadcast on rank {dist.get_rank()}: {tensor}")
3048
 
3049
  init_process()
3050
+ example_broadcast()
3051
  </d-code>
3052
 
3053
 
 
3380
 
3381
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3382
 
3383
+ <p>Now that we've covered the fundamental operations for distributed training, you should be ready to follow the blog post easily.</p>
3384
 
3385
  <h3>A1: Distributed Training Profiling</h3>
3386
 
 
3829
  }
3830
  if (level === 0)
3831
  ToC += '<div>' + link + '</div>';
3832
+ else if (level === 1)
 
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
src/index.js CHANGED
@@ -1,11 +1,9 @@
1
  // import { plotClusters } from './clusters'
2
  import { init_memory_plot } from './memory'
3
  import { loadFragments } from './fragmentLoader'
4
- import { syncHFSpacesURLHash } from './syncHFSpacesURLHash'
5
 
6
  document.addEventListener("DOMContentLoaded", () => {
7
  console.log("DOMContentLoaded");
8
  loadFragments();
9
  init_memory_plot();
10
- syncHFSpacesURLHash();
11
  }, { once: true });
 
1
  // import { plotClusters } from './clusters'
2
  import { init_memory_plot } from './memory'
3
  import { loadFragments } from './fragmentLoader'
 
4
 
5
  document.addEventListener("DOMContentLoaded", () => {
6
  console.log("DOMContentLoaded");
7
  loadFragments();
8
  init_memory_plot();
 
9
  }, { once: true });
src/syncHFSpacesURLHash.js DELETED
@@ -1,124 +0,0 @@
1
- const queryArg = "section";
2
-
3
- function syncHFSpacesURLHash() {
4
- // Handle explicit section requests (don't update hash automatically on load)
5
- const hasExplicitRequest = handleExplicitSectionRequest();
6
-
7
- // Set up hash change monitoring
8
- updateHashBasedOnHashChange();
9
-
10
- // Always set up scroll monitoring to update hash during scrolling
11
- setupScrollMonitoring();
12
-
13
- // If no explicit request, we don't update the hash on initial load
14
- // The hash will only start updating when the user scrolls
15
- }
16
-
17
- function handleExplicitSectionRequest() {
18
- // Check for section parameter in URL
19
- const urlParams = new URLSearchParams(window.location.search);
20
- const sectionId = urlParams.get(queryArg);
21
-
22
- // If we have an explicit section request
23
- if (sectionId) {
24
- const targetElement = document.getElementById(sectionId);
25
- if (targetElement) {
26
- // Slight delay to ensure the browser doesn't try to do its own scrolling first
27
- setTimeout(() => {
28
- targetElement.scrollIntoView();
29
- history.replaceState(null, null, `#${sectionId}`);
30
- }, 100);
31
- }
32
- return true;
33
- }
34
-
35
- // No explicit section parameter found
36
- return false;
37
- }
38
-
39
- function setupScrollMonitoring() {
40
- // Variables to manage throttling
41
- let isScrolling = false;
42
- let lastKnownScrollPosition = 0;
43
- let initialScroll = true;
44
-
45
- // Add the scroll event listener
46
- window.addEventListener('scroll', function() {
47
- lastKnownScrollPosition = window.scrollY;
48
-
49
- if (!isScrolling) {
50
- window.requestAnimationFrame(function() {
51
- // Skip the first scroll event which might be browser's automatic scroll
52
- // to a hash on page load
53
- if (initialScroll) {
54
- initialScroll = false;
55
- } else {
56
- updateHashBasedOnScroll(lastKnownScrollPosition);
57
- }
58
- isScrolling = false;
59
- });
60
- }
61
-
62
- isScrolling = true;
63
- });
64
- }
65
-
66
- // Function to update the URL hash based on scroll position
67
- function updateHashBasedOnScroll(scrollPosition) {
68
- const closestHeading = findClosestHeading(scrollPosition);
69
-
70
- // Update the URL hash if we found a closest element
71
- if (closestHeading && closestHeading.id) {
72
- // Only update if the hash is different to avoid unnecessary operations
73
- if (window.location.hash !== `#${closestHeading.id}`) {
74
- silentlyUpdateHash(closestHeading.id);
75
- postMessageToHFSpaces(closestHeading.id);
76
- }
77
- }
78
- }
79
-
80
- // Find the closest heading to the current scroll position
81
- function findClosestHeading(scrollPosition) {
82
- // Get only heading elements with IDs that we want to track
83
- const headingsWithIds = Array.from(document.querySelectorAll('h1[id], h2[id], h3[id], h4[id], h5[id], h6[id]'));
84
-
85
- // Skip if there are no headings with IDs
86
- if (headingsWithIds.length === 0) return null;
87
-
88
- // Find the element closest to the middle of the viewport
89
- let closestHeading = null;
90
- let closestDistance = Infinity;
91
- const viewportMiddle = scrollPosition + window.innerHeight / 2;
92
-
93
- // Iterate through all headings to find the closest one
94
- headingsWithIds.forEach(heading => {
95
- const headingTop = heading.getBoundingClientRect().top + scrollPosition;
96
- const distance = Math.abs(headingTop - viewportMiddle);
97
-
98
- if (distance < closestDistance) {
99
- closestDistance = distance;
100
- closestHeading = heading;
101
- }
102
- });
103
-
104
- return closestHeading;
105
- }
106
-
107
- // Update hash without triggering scroll or other side effects
108
- function silentlyUpdateHash(id) {
109
- history.replaceState(null, null, `#${id}`);
110
- }
111
-
112
- function updateHashBasedOnHashChange() {
113
- window.addEventListener('hashchange', () => {
114
- const elementId = window.location.hash.slice(1);
115
- postMessageToHFSpaces(elementId);
116
- });
117
- }
118
-
119
- function postMessageToHFSpaces(elementId) {
120
- const parentOrigin = "https://huggingface.co";
121
- window.parent.postMessage({ queryString: `${queryArg}=${elementId}` }, parentOrigin);
122
- }
123
-
124
- export { syncHFSpacesURLHash };