* Fix: pkg/llms_from_scratch/appendix_d.py

* minor language typo fix

* fix 691

---------

Co-authored-by: PrinceSajjadHussain <PrinceSajjadHussain@users.noreply.github.com>
Co-authored-by: rasbt <mail@sebastianraschka.com>
@@ -615,7 +615,7 @@
" if global_step > warmup_steps:\n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) \n",
" else:\n",
- " if global_step >= warmup_steps: # the book originally used global_step > warmup_steps, which lead to a skipped clipping step after warmup\n",
+ " if global_step >= warmup_steps: # the book originally used global_step > warmup_steps, which led to a skipped clipping step after warmup\n",
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
" \n",
" optimizer.step()\n",
@@ -66,7 +66,7 @@ def train_model(model, train_loader, val_loader, optimizer, device,
                 if global_step > warmup_steps:
                     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
             else:
-                if global_step >= warmup_steps: # the book originally used global_step > warmup_steps, which lead to a skipped clipping step after warmup
+                if global_step >= warmup_steps: # the book originally used global_step > warmup_steps, which led to a skipped clipping step after warmup
                     torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

             optimizer.step()
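For reference, the only behavioral difference between the two comparisons is the single step where `global_step == warmup_steps`, i.e. the first step after warmup. A minimal sketch of that boundary (the `warmup_steps` value and step range are illustrative, not taken from the book or the repo):

```python
# Minimal sketch of the off-by-one, not the repo's training loop.
# warmup_steps and the step range are illustrative values.
warmup_steps = 3
steps = range(6)

clipped_old = [s for s in steps if s > warmup_steps]    # original book check
clipped_new = [s for s in steps if s >= warmup_steps]   # corrected check

print(clipped_old)  # [4, 5]    -> step 3, the first post-warmup step, is never clipped
print(clipped_new)  # [3, 4, 5] -> clipping starts as soon as warmup ends
```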
@@ -160,7 +160,7 @@ class PyTorchMultiHeadAttention(nn.Module):
     def __init__(self, d_in, d_out, num_heads, dropout=0.0, qkv_bias=False):
         super().__init__()
-        assert d_out % num_heads == 0, "embed_dim is indivisible by num_heads"
+        assert d_out % num_heads == 0, "d_out is indivisible by num_heads"
         self.num_heads = num_heads
         self.head_dim = d_out // num_heads
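The corrected message reflects what the assert actually guards: `d_out`, the output size of the attention projections, must split evenly into `num_heads` chunks of `head_dim = d_out // num_heads`. A minimal sketch of that split, using illustrative sizes and a stand-in projection layer rather than the class's actual layers:

```python
import torch
import torch.nn as nn

# Illustrative sizes; any d_out divisible by num_heads works.
d_in, d_out, num_heads = 256, 768, 12
assert d_out % num_heads == 0, "d_out is indivisible by num_heads"
head_dim = d_out // num_heads  # 64

x = torch.randn(2, 10, d_in)                  # (batch, num_tokens, d_in)
W_query = nn.Linear(d_in, d_out, bias=False)  # stand-in projection for illustration
queries = W_query(x)                          # (2, 10, d_out)

# The d_out dimension is reshaped into num_heads slices of head_dim each,
# which is why d_out (not d_in or an "embed_dim") must be divisible by num_heads.
queries = queries.view(2, 10, num_heads, head_dim).transpose(1, 2)
print(queries.shape)  # torch.Size([2, 12, 10, 64])
```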