
remove redundant double-unsqueeze
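
Why a single unsqueeze suffices (a minimal sketch, not part of the original commit): attn_scores has shape (b, num_heads, num_tokens, num_tokens) while mask_bool has shape (num_tokens, num_tokens). PyTorch broadcasting aligns trailing dimensions and treats missing leading dimensions as size 1, so a (1, num_tokens, num_tokens) mask already broadcasts over both the batch and head dimensions; the second unsqueeze added nothing. The sizes below (b=2, num_heads=4, num_tokens=6) are illustrative, not taken from the book's code.

import torch

b, num_heads, num_tokens = 2, 4, 6
attn_scores = torch.randn(b, num_heads, num_tokens, num_tokens)

# Causal mask, truncated and converted to boolean as in forward()
mask = torch.triu(torch.ones(num_tokens, num_tokens), diagonal=1)
mask_bool = mask.bool()[:num_tokens, :num_tokens]

# (1, T, T) and (1, 1, T, T) broadcast identically against (b, H, T, T)
single = attn_scores.clone().masked_fill(mask_bool.unsqueeze(0), -torch.inf)
double = attn_scores.clone().masked_fill(
    mask_bool.unsqueeze(0).unsqueeze(0), -torch.inf
)
assert torch.equal(single, double)  # the second unsqueeze was redundant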

rasbt 1 year ago
parent commit
b827bf4eea

+ 7 - 15
ch03/01_main-chapter-code/ch03.ipynb

@@ -1608,7 +1608,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 39,
    "id": "110b0188-6e9e-4e56-a988-10523c6c8538",
    "metadata": {},
    "outputs": [
@@ -1672,8 +1672,8 @@
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask twice to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)\n",
+    "        # Unsqueeze the mask to match dimensions\n",
+    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
     "        # Use the unsqueezed mask to fill attention scores\n",
     "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
     "        \n",
@@ -1729,7 +1729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 40,
    "id": "e8cfc1ae-78ab-4faa-bc73-98bd054806c9",
    "metadata": {},
    "outputs": [
@@ -1772,7 +1772,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 41,
    "id": "053760f1-1a02-42f0-b3bf-3d939e407039",
    "metadata": {},
    "outputs": [
@@ -1804,7 +1804,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 42,
    "id": "08c2a3fd-e674-4d69-9ef4-ea94b788e937",
    "metadata": {},
    "outputs": [
@@ -1814,7 +1814,7 @@
        "2360064"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1847,14 +1847,6 @@
    "source": [
     "- See the [./multihead-attention.ipynb](./multihead-attention.ipynb) code notebook, which is a concise version of the data loader (chapter 2) plus the multi-head attention class that we implemented in this chapter and will need for training the GPT model in upcoming chapters."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9f5b7a94-78d0-49d5-896f-21696cb331b7",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

+ 2 - 2
ch03/01_main-chapter-code/multihead-attention.ipynb

@@ -278,8 +278,8 @@
     "        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head\n",
     "        # Original mask truncated to the number of tokens and converted to boolean\n",
     "        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]\n",
-    "        # Unsqueeze the mask twice to match dimensions\n",
-    "        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)\n",
+    "        # Unsqueeze the mask to match dimensions\n",
+    "        mask_unsqueezed = mask_bool.unsqueeze(0)\n",
     "        # Use the unsqueezed mask to fill attention scores\n",
     "        attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)\n",
     "        \n",

+ 2 - 2
ch04/01_main-chapter-code/gpt.py

@@ -91,8 +91,8 @@ class MultiHeadAttention(nn.Module):
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask twice to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
+        # Unsqueeze the mask to match dimensions
+        mask_unsqueezed = mask_bool.unsqueeze(0)
         # Use the unsqueezed mask to fill attention scores
         attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
 

+ 2 - 2
ch04/01_main-chapter-code/previous_chapters.py

@@ -80,8 +80,8 @@ class MultiHeadAttention(nn.Module):
         attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
         # Original mask truncated to the number of tokens and converted to boolean
         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        # Unsqueeze the mask twice to match dimensions
-        mask_unsqueezed = mask_bool.unsqueeze(0).unsqueeze(0)
+        # Unsqueeze the mask to match dimensions
+        mask_unsqueezed = mask_bool.unsqueeze(0)
         # Use the unsqueezed mask to fill attention scores
         attn_scores.masked_fill_(mask_unsqueezed, -torch.inf)
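
A further observation (not part of this commit): since broadcasting left-pads missing dimensions with size 1, even the remaining unsqueeze is technically optional; a (num_tokens, num_tokens) mask broadcasts against the 4D score tensor as-is. Keeping one explicit unsqueeze arguably documents the intended broadcast over batch and heads. A self-contained check, again with illustrative sizes:

import torch

scores = torch.randn(2, 4, 6, 6)  # (b, num_heads, num_tokens, num_tokens)
mask_bool = torch.triu(torch.ones(6, 6), diagonal=1).bool()

no_unsqueeze = scores.masked_fill(mask_bool, -torch.inf)
one_unsqueeze = scores.masked_fill(mask_bool.unsqueeze(0), -torch.inf)
assert torch.equal(no_unsqueeze, one_unsqueeze)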