浏览代码

cleanup and minimal notebook

Sebastian R 2 年之前
父节点
当前提交
ab1261d9b1

+ 260 - 143
ch02/01_main-chapter-code/ch02.ipynb

@@ -548,7 +548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
    "metadata": {},
    "outputs": [
@@ -558,7 +558,7 @@
        "1161"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -569,7 +569,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
    "metadata": {},
    "outputs": [
@@ -600,7 +600,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "948861c5-3f30-4712-a234-725f20d26f68",
    "metadata": {},
    "outputs": [],
@@ -636,32 +636,68 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer = SimpleTokenizerV2(vocab)\n",
+    "\n",
+    "text1 = \"Hello, do you like tea?\"\n",
+    "text2 = \"In the sunlit terraces of the palace.\"\n",
+    "\n",
+    "text = \" <|endoftext|> \".join((text1, text2))\n",
+    "\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95",
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[1, 7, 364, 1157, 644, 1002, 12, 0, 59, 1015, 983, 1011, 740, 1015, 1, 9]"
+       "[1160,\n",
+       " 5,\n",
+       " 362,\n",
+       " 1155,\n",
+       " 642,\n",
+       " 1000,\n",
+       " 10,\n",
+       " 1159,\n",
+       " 57,\n",
+       " 1013,\n",
+       " 981,\n",
+       " 1009,\n",
+       " 738,\n",
+       " 1013,\n",
+       " 1160,\n",
+       " 7]"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "tokenizer = SimpleTokenizerV2(vocab)\n",
-    "\n",
-    "text = \"Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\"\n",
-    "\n",
     "tokenizer.encode(text)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 21,
    "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b",
    "metadata": {},
    "outputs": [
@@ -671,7 +707,7 @@
        "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -703,7 +739,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 22,
    "id": "ede1d41f-934b-4bf4-8184-54394a257a94",
    "metadata": {},
    "outputs": [],
@@ -713,7 +749,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 23,
    "id": "48967a77-7d17-42bf-9e92-fc619d63a59e",
    "metadata": {},
    "outputs": [
@@ -734,7 +770,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 24,
    "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128",
    "metadata": {},
    "outputs": [],
@@ -744,7 +780,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 25,
    "id": "5ff2cd85-7cfb-4325-b390-219938589428",
    "metadata": {},
    "outputs": [
@@ -766,7 +802,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 26,
    "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab",
    "metadata": {},
    "outputs": [
@@ -784,6 +820,76 @@
     "print(strings)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f63d62ab-4b80-489c-8041-e4052fe29969",
+   "metadata": {},
+   "source": [
+    "- Experiments with unknown words:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "ce25cf25-a2bb-44d2-bac1-cb566f433f98",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[33901, 86, 343, 86, 220, 959]\n"
+     ]
+    }
+   ],
+   "source": [
+    "integers = tokenizer.encode(\"Akwirw ier\")\n",
+    "print(integers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "3e224f96-41d0-4074-ac6e-f7db2490f806",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "33901 -> Ak\n",
+      "86 -> w\n",
+      "343 -> ir\n",
+      "86 -> w\n",
+      "220 ->  \n",
+      "959 -> ier\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in integers:\n",
+    "    print(f\"{i} -> {tokenizer.decode([i])}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "766bcf29-64bf-47ca-9b65-4ae8e607d580",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Akwirw ier\n"
+     ]
+    }
+   ],
+   "source": [
+    "strings = tokenizer.decode(integers)\n",
+    "print(strings)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
@@ -794,7 +900,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 30,
    "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b",
    "metadata": {},
    "outputs": [
@@ -807,10 +913,10 @@
     }
    ],
    "source": [
-    "with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()\n",
     "\n",
-    "enc_text = tokenizer.encode(raw_text, allowed_special={\"<|endoftext|>\"})\n",
+    "enc_text = tokenizer.encode(raw_text)\n",
     "print(len(enc_text))"
    ]
   },
@@ -825,7 +931,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 31,
    "id": "e84424a7-646d-45b6-99e3-80d15fb761f2",
    "metadata": {},
    "outputs": [],
@@ -835,7 +941,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 32,
    "id": "dfbff852-a92f-48c8-a46d-143a0f109f40",
    "metadata": {},
    "outputs": [
@@ -868,7 +974,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 33,
    "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366",
    "metadata": {},
    "outputs": [
@@ -878,12 +984,13 @@
      "text": [
       "[290] ----> 4920\n",
       "[290, 4920] ----> 2241\n",
-      "[290, 4920, 2241] ----> 287\n"
+      "[290, 4920, 2241] ----> 287\n",
+      "[290, 4920, 2241, 287] ----> 257\n"
      ]
     }
    ],
    "source": [
-    "for i in range(1, context_size):\n",
+    "for i in range(1, context_size+1):\n",
     "    context = enc_sample[:i]\n",
     "    desired = enc_sample[i]\n",
     "\n",
@@ -892,7 +999,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 34,
    "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1",
    "metadata": {},
    "outputs": [
@@ -902,12 +1009,13 @@
      "text": [
       " and ---->  established\n",
       " and established ---->  himself\n",
-      " and established himself ---->  in\n"
+      " and established himself ---->  in\n",
+      " and established himself in ---->  a\n"
      ]
     }
    ],
    "source": [
-    "for i in range(1, context_size):\n",
+    "for i in range(1, context_size+1):\n",
     "    context = enc_sample[:i]\n",
     "    desired = enc_sample[i]\n",
     "\n",
@@ -933,7 +1041,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 35,
    "id": "e1770134-e7f3-4725-a679-e04c3be48cac",
    "metadata": {},
    "outputs": [
@@ -941,7 +1049,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch version: 2.0.1\n"
+      "PyTorch version: 2.1.0\n"
      ]
     }
    ],
@@ -960,7 +1068,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 36,
    "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375",
    "metadata": {},
    "outputs": [],
@@ -993,7 +1101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 37,
    "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e",
    "metadata": {},
    "outputs": [],
@@ -1021,18 +1129,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 38,
    "id": "df31d96c-6bfd-4564-a956-6192242d7579",
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 39,
    "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f",
    "metadata": {},
    "outputs": [
@@ -1048,13 +1156,13 @@
     "dataloader = create_dataloader(raw_text, batch_size=1, max_length=4, stride=1)\n",
     "\n",
     "data_iter = iter(dataloader)\n",
-    "next_batch = next(data_iter)\n",
-    "print(next_batch)"
+    "first_batch = next(data_iter)\n",
+    "print(first_batch)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 40,
    "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d",
    "metadata": {},
    "outputs": [
@@ -1067,8 +1175,8 @@
     }
    ],
    "source": [
-    "next_batch = next(data_iter)\n",
-    "print(next_batch)"
+    "second_batch = next(data_iter)\n",
+    "print(second_batch)"
    ]
   },
   {
@@ -1077,12 +1185,12 @@
    "metadata": {},
    "source": [
     "- We can also create batched outputs\n",
-    "- Note that we increase the stride here so that we don't have overlaps between the batches, which could lead to increased overfitting"
+    "- Note that we increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 41,
    "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c",
    "metadata": {},
    "outputs": [
@@ -1149,7 +1257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 42,
    "id": "15a6304c-9474-4470-b85d-3991a49fa653",
    "metadata": {},
    "outputs": [],
@@ -1167,7 +1275,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 43,
    "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda",
    "metadata": {},
    "outputs": [],
@@ -1189,29 +1297,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 44,
    "id": "a686eb61-e737-4351-8f1c-222913d47468",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "Parameter containing:\n",
-       "tensor([[ 0.3374, -0.1778, -0.1690],\n",
-       "        [ 0.9178,  1.5810,  1.3010],\n",
-       "        [ 1.2753, -0.2010, -0.1606],\n",
-       "        [-0.4015,  0.9666, -1.1481],\n",
-       "        [-1.1589,  0.3255, -0.6315],\n",
-       "        [-2.8400, -0.7849, -1.4096]], requires_grad=True)"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[ 0.3374, -0.1778, -0.1690],\n",
+      "        [ 0.9178,  1.5810,  1.3010],\n",
+      "        [ 1.2753, -0.2010, -0.1606],\n",
+      "        [-0.4015,  0.9666, -1.1481],\n",
+      "        [-1.1589,  0.3255, -0.6315],\n",
+      "        [-2.8400, -0.7849, -1.4096]], requires_grad=True)\n"
+     ]
     }
    ],
    "source": [
-    "embedding_layer.weight"
+    "print(embedding_layer.weight)"
    ]
   },
   {
@@ -1233,23 +1338,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 45,
    "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)\n"
+     ]
     }
    ],
    "source": [
-    "embedding_layer(torch.tensor([3]))"
+    "print(embedding_layer(torch.tensor([3])))"
    ]
   },
   {
@@ -1263,47 +1365,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 46,
    "id": "50280ead-0363-44c8-8c35-bb885d92c8b7",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "tensor([[-2.8400, -0.7849, -1.4096],\n",
-       "        [ 0.9178,  1.5810,  1.3010],\n",
-       "        [-0.4015,  0.9666, -1.1481],\n",
-       "        [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[-2.8400, -0.7849, -1.4096],\n",
+      "        [ 0.9178,  1.5810,  1.3010],\n",
+      "        [-0.4015,  0.9666, -1.1481],\n",
+      "        [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)\n"
+     ]
     }
    ],
    "source": [
-    "embedding_layer(input_ids)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "53f452c4-5fcb-4528-8fda-fd1a16f26bc7",
-   "metadata": {},
-   "source": [
-    "- The BytePair encoder has a vocabulary size of 50,257:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "91c1f77f-cb0c-4f72-a258-ec9bab2bc755",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vocab_size = 50257\n",
-    "output_dim = 256\n",
-    "\n",
-    "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)"
+    "print(embedding_layer(input_ids))"
    ]
   },
   {
@@ -1319,12 +1397,13 @@
    "id": "7f187f87-c1f8-4c2e-8050-350bbb972f55",
    "metadata": {},
    "source": [
+    "- The BytePair encoder has a vocabulary size of 50,257:\n",
     "- Suppose we want to encode the input tokens into a 256-dimensional vector representation:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 48,
    "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041",
    "metadata": {},
    "outputs": [],
@@ -1340,42 +1419,70 @@
    "id": "a2654722-24e4-4b0d-a43c-436a461eb70b",
    "metadata": {},
    "source": [
-    "- If we sample data from teh dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
+    "- If we sample data from the dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
     "- If we have a batch size of 8 with 4 tokens each, this results in a 8 x 4 x 256 tensor:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 49,
    "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=5)\n",
+    "max_length = 4\n",
+    "dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=5)\n",
     "data_iter = iter(dataloader)\n",
     "inputs, targets = next(data_iter)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 50,
+   "id": "84416b60-3707-4370-bcbc-da0b62f2b64d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token IDs:\n",
+      " tensor([[   40,   367,  2885,  1464],\n",
+      "        [ 3619,   402,   271, 10899],\n",
+      "        [  257,  7026, 15632,   438],\n",
+      "        [  257,   922,  5891,  1576],\n",
+      "        [  568,   340,   373,   645],\n",
+      "        [ 5975,   284,   502,   284],\n",
+      "        [  326,    11,   287,   262],\n",
+      "        [  286,   465, 13476,    11]])\n",
+      "\n",
+      "Inputs shape:\n",
+      " torch.Size([8, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Token IDs:\\n\", inputs)\n",
+    "print(\"\\nInputs shape:\\n\", inputs.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
    "id": "7766ec38-30d0-4128-8c31-f49f063c43d1",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([8, 4, 256])"
-      ]
-     },
-     "execution_count": 45,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([8, 4, 256])\n"
+     ]
     }
    ],
    "source": [
     "token_embeddings = token_embedding_layer(inputs)\n",
-    "token_embeddings.shape"
+    "print(token_embeddings.shape)"
    ]
   },
   {
@@ -1383,12 +1490,12 @@
    "id": "fe2ae164-6f19-4e32-b9e5-76950fcf1c9f",
    "metadata": {},
    "source": [
-    "- GPT2 uses absolute position embeddings, so we just create another embedding layer:"
+    "- GPT-2 uses absolute position embeddings, so we just create another embedding layer:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 52,
    "id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07",
    "metadata": {},
    "outputs": [],
@@ -1398,24 +1505,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 53,
    "id": "c369a1e7-d566-4b53-b398-d6adafb44105",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([8, 4, 256])"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([4, 256])\n"
+     ]
     }
    ],
    "source": [
-    "pos_embeddings = pos_embedding_layer(inputs)\n",
-    "pos_embeddings.shape"
+    "pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
+    "print(pos_embeddings.shape)"
    ]
   },
   {
@@ -1428,25 +1532,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 54,
    "id": "b22fab89-526e-43c8-9035-5b7018e34288",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([8, 4, 256])"
-      ]
-     },
-     "execution_count": 48,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([8, 4, 256])\n"
+     ]
     }
    ],
    "source": [
     "input_embeddings = token_embeddings + pos_embeddings\n",
-    "input_embeddings.shape"
+    "print(input_embeddings.shape)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6b71f61-57f4-496b-bf48-9097c591f54c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2894bbd-6cf5-4bfa-80ad-a23b5d1a45f4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -1465,7 +1582,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

+ 150 - 0
ch02/01_main-chapter-code/dataloader.ipynb

@@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6f678e62-7bcb-4405-86ae-dce94f494303",
+   "metadata": {},
+   "source": [
+    "# The Main Data Loading Pipeline Summarized"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "070000fc-a7b7-4c56-a2c0-a938d413a790",
+   "metadata": {},
+   "source": [
+    "The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).\n",
+    "\n",
+    "This notebook contains the main takeaway, the data loading pipeline without the intermediate steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tiktoken\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "\n",
+    "\n",
+    "class GPTDatasetV1(Dataset):\n",
+    "    def __init__(self, txt, tokenizer, max_length, stride):\n",
+    "        self.tokenizer = tokenizer\n",
+    "        self.input_ids = []\n",
+    "        self.target_ids = []\n",
+    "\n",
+    "        # Tokenize the entire text\n",
+    "        token_ids = tokenizer.encode(txt)\n",
+    "\n",
+    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
+    "        for i in range(0, len(token_ids) - max_length, stride):\n",
+    "            input_chunk = token_ids[i:i + max_length]\n",
+    "            target_chunk = token_ids[i + 1: i + max_length + 1]\n",
+    "            self.input_ids.append(torch.tensor(input_chunk))\n",
+    "            self.target_ids.append(torch.tensor(target_chunk))\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.input_ids)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        return self.input_ids[idx], self.target_ids[idx]\n",
+    "\n",
+    "\n",
+    "def create_dataloader(txt, batch_size=4, max_length=256, stride=128):\n",
+    "    # Initialize the tokenizer\n",
+    "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
+    "\n",
+    "    # Create dataset\n",
+    "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
+    "\n",
+    "    # Create dataloader\n",
+    "    dataloader = DataLoader(dataset, batch_size=batch_size)\n",
+    "\n",
+    "    return dataloader\n",
+    "\n",
+    "\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "    raw_text = f.read()\n",
+    "\n",
+    "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
+    "encoded_text = tokenizer.encode(raw_text)\n",
+    "\n",
+    "vocab_size = 50257\n",
+    "output_dim = 256\n",
+    "max_length = 4\n",
+    "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
+    "# Positional embeddings need one row per position in the context window (max_length), not per token ID\n",
+    "pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)\n",
+    "dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for batch in dataloader:\n",
+    "    x, y = batch\n",
+    "\n",
+    "    token_embeddings = token_embedding_layer(x)\n",
+    "    pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
+    "\n",
+    "    input_embeddings = token_embeddings + pos_embeddings\n",
+    "\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "d3664332-e6bb-447e-8b96-203aafde8b24",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([8, 4, 256])\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(input_embeddings.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2773c09d-c136-4372-a2be-04b58d292842",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 1 - 1
ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb

@@ -478,7 +478,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

+ 117 - 0
ch02/Untitled.ipynb

@@ -0,0 +1,117 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "98efe79e-daa3-40d0-ab4d-f667d4d6ba9d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/Author/miniforge3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Downloading (…)olve/main/vocab.json: 100%|█| 1.04M/1.04M [00:00<00:00, 1.66MB/s]\n",
+      "Downloading (…)olve/main/merges.txt: 100%|███| 456k/456k [00:00<00:00, 2.44MB/s]\n",
+      "Downloading (…)/main/tokenizer.json: 100%|█| 1.36M/1.36M [00:00<00:00, 1.97MB/s]\n",
+      "Downloading (…)lve/main/config.json: 100%|██████| 718/718 [00:00<00:00, 621kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Decoded Inputs:\n",
+      "I HAD always\n",
+      " Jack Gisburn\n",
+      " a cheap genius--\n",
+      " a good fellow enough\n",
+      "so it was no\n",
+      " surprise to me to\n",
+      " that, in the\n",
+      " of his glory,\n",
+      "\n",
+      "Decoded Targets:\n",
+      " HAD always thought\n",
+      " Gisburn rather\n",
+      " cheap genius--though\n",
+      " good fellow enough--\n",
+      " it was no great\n",
+      " to me to hear\n",
+      ", in the height\n",
+      " his glory, he\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import GPT2Tokenizer\n",
+    "\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')\n",
+    "\n",
+    "inputs = torch.tensor([\n",
+    "    [40, 367, 2885, 1464],\n",
+    "    [3619, 402, 271, 10899],\n",
+    "    [257, 7026, 15632, 438],\n",
+    "    [257, 922, 5891, 1576],\n",
+    "    [568, 340, 373, 645],\n",
+    "    [5975, 284, 502, 284],\n",
+    "    [326, 11, 287, 262],\n",
+    "    [286, 465, 13476, 11]\n",
+    "])\n",
+    "\n",
+    "targets = torch.tensor([\n",
+    "    [367, 2885, 1464, 1807],\n",
+    "    [402, 271, 10899, 2138],\n",
+    "    [7026, 15632, 438, 2016],\n",
+    "    [922, 5891, 1576, 438],\n",
+    "    [340, 373, 645, 1049],\n",
+    "    [284, 502, 284, 3285],\n",
+    "    [11, 287, 262, 6001],\n",
+    "    [465, 13476, 11, 339]\n",
+    "])\n",
+    "\n",
+    "decoded_inputs = [tokenizer.decode(i) for i in inputs]\n",
+    "decoded_targets = [tokenizer.decode(t) for t in targets]\n",
+    "\n",
+    "print(\"Decoded Inputs:\")\n",
+    "for di in decoded_inputs:\n",
+    "    print(di)\n",
+    "\n",
+    "print(\"\\nDecoded Targets:\")\n",
+    "for dt in decoded_targets:\n",
+    "    print(dt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "defc6b2f-9ac2-49e0-a4e1-03247cacffce",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}