浏览代码

cleanup and minimal notebook

Sebastian R 2 年之前
父节点
当前提交
ab1261d9b1

+ 260 - 143
ch02/01_main-chapter-code/ch02.ipynb

@@ -548,7 +548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
    "metadata": {},
    "outputs": [
@@ -558,7 +558,7 @@
        "1161"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -569,7 +569,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
    "metadata": {},
    "outputs": [
@@ -600,7 +600,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "948861c5-3f30-4712-a234-725f20d26f68",
    "metadata": {},
    "outputs": [],
@@ -636,32 +636,68 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\n"
+     ]
+    }
+   ],
+   "source": [
+    "tokenizer = SimpleTokenizerV2(vocab)\n",
+    "\n",
+    "text1 = \"Hello, do you like tea?\"\n",
+    "text2 = \"In the sunlit terraces of the palace.\"\n",
+    "\n",
+    "text = \" <|endoftext|> \".join((text1, text2))\n",
+    "\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95",
+   "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[1, 7, 364, 1157, 644, 1002, 12, 0, 59, 1015, 983, 1011, 740, 1015, 1, 9]"
+       "[1160,\n",
+       " 5,\n",
+       " 362,\n",
+       " 1155,\n",
+       " 642,\n",
+       " 1000,\n",
+       " 10,\n",
+       " 1159,\n",
+       " 57,\n",
+       " 1013,\n",
+       " 981,\n",
+       " 1009,\n",
+       " 738,\n",
+       " 1013,\n",
+       " 1160,\n",
+       " 7]"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "tokenizer = SimpleTokenizerV2(vocab)\n",
-    "\n",
-    "text = \"Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.\"\n",
-    "\n",
     "tokenizer.encode(text)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 21,
    "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b",
    "metadata": {},
    "outputs": [
@@ -671,7 +707,7 @@
        "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -703,7 +739,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 22,
    "id": "ede1d41f-934b-4bf4-8184-54394a257a94",
    "metadata": {},
    "outputs": [],
@@ -713,7 +749,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 23,
    "id": "48967a77-7d17-42bf-9e92-fc619d63a59e",
    "metadata": {},
    "outputs": [
@@ -734,7 +770,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 24,
    "id": "6ad3312f-a5f7-4efc-9d7d-8ea09d7b5128",
    "metadata": {},
    "outputs": [],
@@ -744,7 +780,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 25,
    "id": "5ff2cd85-7cfb-4325-b390-219938589428",
    "metadata": {},
    "outputs": [
@@ -766,7 +802,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 26,
    "id": "d26a48bb-f82e-41a8-a955-a1c9cf9d50ab",
    "metadata": {},
    "outputs": [
@@ -784,6 +820,76 @@
     "print(strings)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f63d62ab-4b80-489c-8041-e4052fe29969",
+   "metadata": {},
+   "source": [
+    "- Experiments with unknown words:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "ce25cf25-a2bb-44d2-bac1-cb566f433f98",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[33901, 86, 343, 86, 220, 959]\n"
+     ]
+    }
+   ],
+   "source": [
+    "integers = tokenizer.encode(\"Akwirw ier\")\n",
+    "print(integers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "3e224f96-41d0-4074-ac6e-f7db2490f806",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "33901 -> Ak\n",
+      "86 -> w\n",
+      "343 -> ir\n",
+      "86 -> w\n",
+      "220 ->  \n",
+      "959 -> ier\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in integers:\n",
+    "    print(f\"{i} -> {tokenizer.decode([i])}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "766bcf29-64bf-47ca-9b65-4ae8e607d580",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Akwirw ier\n"
+     ]
+    }
+   ],
+   "source": [
+    "strings = tokenizer.decode(integers)\n",
+    "print(strings)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "abbd7c0d-70f8-4386-a114-907e96c950b0",
@@ -794,7 +900,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 30,
    "id": "848d5ade-fd1f-46c3-9e31-1426e315c71b",
    "metadata": {},
    "outputs": [
@@ -807,10 +913,10 @@
     }
    ],
    "source": [
-    "with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()\n",
     "\n",
-    "enc_text = tokenizer.encode(raw_text, allowed_special={\"<|endoftext|>\"})\n",
+    "enc_text = tokenizer.encode(raw_text)\n",
     "print(len(enc_text))"
    ]
   },
@@ -825,7 +931,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 31,
    "id": "e84424a7-646d-45b6-99e3-80d15fb761f2",
    "metadata": {},
    "outputs": [],
@@ -835,7 +941,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 32,
    "id": "dfbff852-a92f-48c8-a46d-143a0f109f40",
    "metadata": {},
    "outputs": [
@@ -868,7 +974,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 33,
    "id": "d97b031e-ed55-409d-95f2-aeb38c6fe366",
    "metadata": {},
    "outputs": [
@@ -878,12 +984,13 @@
      "text": [
       "[290] ----> 4920\n",
       "[290, 4920] ----> 2241\n",
-      "[290, 4920, 2241] ----> 287\n"
+      "[290, 4920, 2241] ----> 287\n",
+      "[290, 4920, 2241, 287] ----> 257\n"
      ]
     }
    ],
    "source": [
-    "for i in range(1, context_size):\n",
+    "for i in range(1, context_size+1):\n",
     "    context = enc_sample[:i]\n",
     "    desired = enc_sample[i]\n",
     "\n",
@@ -892,7 +999,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 34,
    "id": "f57bd746-dcbf-4433-8e24-ee213a8c34a1",
    "metadata": {},
    "outputs": [
@@ -902,12 +1009,13 @@
      "text": [
       " and ---->  established\n",
       " and established ---->  himself\n",
-      " and established himself ---->  in\n"
+      " and established himself ---->  in\n",
+      " and established himself in ---->  a\n"
      ]
     }
    ],
    "source": [
-    "for i in range(1, context_size):\n",
+    "for i in range(1, context_size+1):\n",
     "    context = enc_sample[:i]\n",
     "    desired = enc_sample[i]\n",
     "\n",
@@ -933,7 +1041,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 35,
    "id": "e1770134-e7f3-4725-a679-e04c3be48cac",
    "metadata": {},
    "outputs": [
@@ -941,7 +1049,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch version: 2.0.1\n"
+      "PyTorch version: 2.1.0\n"
      ]
     }
    ],
@@ -960,7 +1068,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 36,
    "id": "74b41073-4c9f-46e2-a1bd-d38e4122b375",
    "metadata": {},
    "outputs": [],
@@ -993,7 +1101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 37,
    "id": "5eb30ebe-97b3-43c5-9ff1-a97d621b3c4e",
    "metadata": {},
    "outputs": [],
@@ -1021,18 +1129,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 38,
    "id": "df31d96c-6bfd-4564-a956-6192242d7579",
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open('the-verdict.txt', 'r', encoding='utf-8') as f:\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
     "    raw_text = f.read()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 39,
    "id": "9226d00c-ad9a-4949-a6e4-9afccfc7214f",
    "metadata": {},
    "outputs": [
@@ -1048,13 +1156,13 @@
     "dataloader = create_dataloader(raw_text, batch_size=1, max_length=4, stride=1)\n",
     "\n",
     "data_iter = iter(dataloader)\n",
-    "next_batch = next(data_iter)\n",
-    "print(next_batch)"
+    "first_batch = next(data_iter)\n",
+    "print(first_batch)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 40,
    "id": "10deb4bc-4de1-4d20-921e-4b1c7a0e1a6d",
    "metadata": {},
    "outputs": [
@@ -1067,8 +1175,8 @@
     }
    ],
    "source": [
-    "next_batch = next(data_iter)\n",
-    "print(next_batch)"
+    "second_batch = next(data_iter)\n",
+    "print(second_batch)"
    ]
   },
   {
@@ -1077,12 +1185,12 @@
    "metadata": {},
    "source": [
     "- We can also create batched outputs\n",
-    "- Note that we increase the stride here so that we don't have overlaps between the batches, which could lead to increased overfitting"
+    "- Note that we increase the stride here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 41,
    "id": "1916e7a6-f03d-4f09-91a6-d0bdbac5a58c",
    "metadata": {},
    "outputs": [
@@ -1149,7 +1257,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 42,
    "id": "15a6304c-9474-4470-b85d-3991a49fa653",
    "metadata": {},
    "outputs": [],
@@ -1167,7 +1275,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 43,
    "id": "93cb2cee-9aa6-4bb8-8977-c65661d16eda",
    "metadata": {},
    "outputs": [],
@@ -1189,29 +1297,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 44,
    "id": "a686eb61-e737-4351-8f1c-222913d47468",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "Parameter containing:\n",
-       "tensor([[ 0.3374, -0.1778, -0.1690],\n",
-       "        [ 0.9178,  1.5810,  1.3010],\n",
-       "        [ 1.2753, -0.2010, -0.1606],\n",
-       "        [-0.4015,  0.9666, -1.1481],\n",
-       "        [-1.1589,  0.3255, -0.6315],\n",
-       "        [-2.8400, -0.7849, -1.4096]], requires_grad=True)"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Parameter containing:\n",
+      "tensor([[ 0.3374, -0.1778, -0.1690],\n",
+      "        [ 0.9178,  1.5810,  1.3010],\n",
+      "        [ 1.2753, -0.2010, -0.1606],\n",
+      "        [-0.4015,  0.9666, -1.1481],\n",
+      "        [-1.1589,  0.3255, -0.6315],\n",
+      "        [-2.8400, -0.7849, -1.4096]], requires_grad=True)\n"
+     ]
     }
    ],
    "source": [
-    "embedding_layer.weight"
+    "print(embedding_layer.weight)"
    ]
   },
   {
@@ -1233,23 +1338,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 45,
    "id": "e43600ba-f287-4746-8ddf-d0f71a9023ca",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)\n"
+     ]
     }
    ],
    "source": [
-    "embedding_layer(torch.tensor([3]))"
+    "print(embedding_layer(torch.tensor([3])))"
    ]
   },
   {
@@ -1263,47 +1365,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 46,
    "id": "50280ead-0363-44c8-8c35-bb885d92c8b7",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "tensor([[-2.8400, -0.7849, -1.4096],\n",
-       "        [ 0.9178,  1.5810,  1.3010],\n",
-       "        [-0.4015,  0.9666, -1.1481],\n",
-       "        [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)"
-      ]
-     },
-     "execution_count": 41,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[-2.8400, -0.7849, -1.4096],\n",
+      "        [ 0.9178,  1.5810,  1.3010],\n",
+      "        [-0.4015,  0.9666, -1.1481],\n",
+      "        [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)\n"
+     ]
     }
    ],
    "source": [
-    "embedding_layer(input_ids)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "53f452c4-5fcb-4528-8fda-fd1a16f26bc7",
-   "metadata": {},
-   "source": [
-    "- The BytePair encoder has a vocabulary size of 50,257:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "91c1f77f-cb0c-4f72-a258-ec9bab2bc755",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vocab_size = 50257\n",
-    "output_dim = 256\n",
-    "\n",
-    "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)"
+    "print(embedding_layer(input_ids))"
    ]
   },
   {
@@ -1319,12 +1397,13 @@
    "id": "7f187f87-c1f8-4c2e-8050-350bbb972f55",
    "metadata": {},
    "source": [
+    "- The BytePair encoder has a vocabulary size of 50,257:\n",
     "- Suppose we want to encode the input tokens into a 256-dimensional vector representation:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 48,
    "id": "0b9e344d-03a6-4f2c-b723-67b6a20c5041",
    "metadata": {},
    "outputs": [],
@@ -1340,42 +1419,70 @@
    "id": "a2654722-24e4-4b0d-a43c-436a461eb70b",
    "metadata": {},
    "source": [
-    "- If we sample data from teh dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
+    "- If we sample data from the dataloader, we embed the tokens in each batch into a 256-dimensional vector\n",
     "- If we have a batch size of 8 with 4 tokens each, this results in a 8 x 4 x 256 tensor:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 49,
    "id": "ad56a263-3d2e-4d91-98bf-d0b68d3c7fc3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataloader = create_dataloader(raw_text, batch_size=8, max_length=4, stride=5)\n",
+    "max_length = 4\n",
+    "dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=5)\n",
     "data_iter = iter(dataloader)\n",
     "inputs, targets = next(data_iter)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 50,
+   "id": "84416b60-3707-4370-bcbc-da0b62f2b64d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token IDs:\n",
+      " tensor([[   40,   367,  2885,  1464],\n",
+      "        [ 3619,   402,   271, 10899],\n",
+      "        [  257,  7026, 15632,   438],\n",
+      "        [  257,   922,  5891,  1576],\n",
+      "        [  568,   340,   373,   645],\n",
+      "        [ 5975,   284,   502,   284],\n",
+      "        [  326,    11,   287,   262],\n",
+      "        [  286,   465, 13476,    11]])\n",
+      "\n",
+      "Inputs shape:\n",
+      " torch.Size([8, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Token IDs:\\n\", inputs)\n",
+    "print(\"\\nInputs shape:\\n\", inputs.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
    "id": "7766ec38-30d0-4128-8c31-f49f063c43d1",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([8, 4, 256])"
-      ]
-     },
-     "execution_count": 45,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([8, 4, 256])\n"
+     ]
     }
    ],
    "source": [
     "token_embeddings = token_embedding_layer(inputs)\n",
-    "token_embeddings.shape"
+    "print(token_embeddings.shape)"
    ]
   },
   {
@@ -1383,12 +1490,12 @@
    "id": "fe2ae164-6f19-4e32-b9e5-76950fcf1c9f",
    "metadata": {},
    "source": [
-    "- GPT2 uses absolute position embeddings, so we just create another embedding layer:"
+    "- GPT-2 uses absolute position embeddings, so we just create another embedding layer:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 52,
    "id": "cc048e20-7ac8-417e-81f5-8fe6f9a4fe07",
    "metadata": {},
    "outputs": [],
@@ -1398,24 +1505,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 53,
    "id": "c369a1e7-d566-4b53-b398-d6adafb44105",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([8, 4, 256])"
-      ]
-     },
-     "execution_count": 47,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([4, 256])\n"
+     ]
     }
    ],
    "source": [
-    "pos_embeddings = pos_embedding_layer(inputs)\n",
-    "pos_embeddings.shape"
+    "pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
+    "print(pos_embeddings.shape)"
    ]
   },
   {
@@ -1428,25 +1532,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 54,
    "id": "b22fab89-526e-43c8-9035-5b7018e34288",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "torch.Size([8, 4, 256])"
-      ]
-     },
-     "execution_count": 48,
-     "metadata": {},
-     "output_type": "execute_result"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([8, 4, 256])\n"
+     ]
     }
    ],
    "source": [
     "input_embeddings = token_embeddings + pos_embeddings\n",
-    "input_embeddings.shape"
+    "print(input_embeddings.shape)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6b71f61-57f4-496b-bf48-9097c591f54c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2894bbd-6cf5-4bfa-80ad-a23b5d1a45f4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -1465,7 +1582,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

+ 150 - 0
ch02/01_main-chapter-code/dataloader.ipynb

@@ -0,0 +1,150 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "6f678e62-7bcb-4405-86ae-dce94f494303",
+   "metadata": {},
+   "source": [
+    "# The Main Data Loading Pipeline Summarized"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "070000fc-a7b7-4c56-a2c0-a938d413a790",
+   "metadata": {},
+   "source": [
+    "The complete chapter code is located in [ch02.ipynb](./ch02.ipynb).\n",
+    "\n",
+    "This notebook contains the main takeaway, the data loading pipeline without the intermediate steps."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0ed4b7db-3b47-4fd3-a4a6-5f4ed5dd166e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tiktoken\n",
+    "import torch\n",
+    "from torch.utils.data import Dataset, DataLoader\n",
+    "\n",
+    "\n",
+    "class GPTDatasetV1(Dataset):\n",
+    "    def __init__(self, txt, tokenizer, max_length, stride):\n",
+    "        self.tokenizer = tokenizer\n",
+    "        self.input_ids = []\n",
+    "        self.target_ids = []\n",
+    "\n",
+    "        # Tokenize the entire text\n",
+    "        token_ids = tokenizer.encode(txt)\n",
+    "\n",
+    "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
+    "        for i in range(0, len(token_ids) - max_length, stride):\n",
+    "            input_chunk = token_ids[i:i + max_length]\n",
+    "            target_chunk = token_ids[i + 1: i + max_length + 1]\n",
+    "            self.input_ids.append(torch.tensor(input_chunk))\n",
+    "            self.target_ids.append(torch.tensor(target_chunk))\n",
+    "\n",
+    "    def __len__(self):\n",
+    "        return len(self.input_ids)\n",
+    "\n",
+    "    def __getitem__(self, idx):\n",
+    "        return self.input_ids[idx], self.target_ids[idx]\n",
+    "\n",
+    "\n",
+    "def create_dataloader(txt, batch_size=4, max_length=256, stride=128):\n",
+    "    # Initialize the tokenizer\n",
+    "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
+    "\n",
+    "    # Create dataset\n",
+    "    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)\n",
+    "\n",
+    "    # Create dataloader\n",
+    "    dataloader = DataLoader(dataset, batch_size=batch_size)\n",
+    "\n",
+    "    return dataloader\n",
+    "\n",
+    "\n",
+    "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
+    "    raw_text = f.read()\n",
+    "\n",
+    "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
+    "encoded_text = tokenizer.encode(raw_text)\n",
+    "\n",
+    "vocab_size = 50257\n",
+    "output_dim = 256\n",
+    "max_length = 4\n",
+    "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
+    "# Positional embeddings need one row per position in the context window (max_length), not per token ID\n",
+    "pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)\n",
+    "dataloader = create_dataloader(raw_text, batch_size=8, max_length=max_length, stride=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "664397bc-6daa-4b88-90aa-e8fc1fbd5846",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for batch in dataloader:\n",
+    "    x, y = batch\n",
+    "\n",
+    "    token_embeddings = token_embedding_layer(x)\n",
+    "    pos_embeddings = pos_embedding_layer(torch.arange(max_length))\n",
+    "\n",
+    "    input_embeddings = token_embeddings + pos_embeddings\n",
+    "\n",
+    "    break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "d3664332-e6bb-447e-8b96-203aafde8b24",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([8, 4, 256])\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(input_embeddings.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2773c09d-c136-4372-a2be-04b58d292842",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

+ 1 - 1
ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb

@@ -478,7 +478,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

+ 117 - 0
ch02/Untitled.ipynb

@@ -0,0 +1,117 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "98efe79e-daa3-40d0-ab4d-f667d4d6ba9d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/Author/miniforge3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Downloading (…)olve/main/vocab.json: 100%|█| 1.04M/1.04M [00:00<00:00, 1.66MB/s]\n",
+      "Downloading (…)olve/main/merges.txt: 100%|███| 456k/456k [00:00<00:00, 2.44MB/s]\n",
+      "Downloading (…)/main/tokenizer.json: 100%|█| 1.36M/1.36M [00:00<00:00, 1.97MB/s]\n",
+      "Downloading (…)lve/main/config.json: 100%|██████| 718/718 [00:00<00:00, 621kB/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Decoded Inputs:\n",
+      "I HAD always\n",
+      " Jack Gisburn\n",
+      " a cheap genius--\n",
+      " a good fellow enough\n",
+      "so it was no\n",
+      " surprise to me to\n",
+      " that, in the\n",
+      " of his glory,\n",
+      "\n",
+      "Decoded Targets:\n",
+      " HAD always thought\n",
+      " Gisburn rather\n",
+      " cheap genius--though\n",
+      " good fellow enough--\n",
+      " it was no great\n",
+      " to me to hear\n",
+      ", in the height\n",
+      " his glory, he\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import GPT2Tokenizer\n",
+    "\n",
+    "tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')\n",
+    "\n",
+    "inputs = torch.tensor([\n",
+    "    [40, 367, 2885, 1464],\n",
+    "    [3619, 402, 271, 10899],\n",
+    "    [257, 7026, 15632, 438],\n",
+    "    [257, 922, 5891, 1576],\n",
+    "    [568, 340, 373, 645],\n",
+    "    [5975, 284, 502, 284],\n",
+    "    [326, 11, 287, 262],\n",
+    "    [286, 465, 13476, 11]\n",
+    "])\n",
+    "\n",
+    "targets = torch.tensor([\n",
+    "    [367, 2885, 1464, 1807],\n",
+    "    [402, 271, 10899, 2138],\n",
+    "    [7026, 15632, 438, 2016],\n",
+    "    [922, 5891, 1576, 438],\n",
+    "    [340, 373, 645, 1049],\n",
+    "    [284, 502, 284, 3285],\n",
+    "    [11, 287, 262, 6001],\n",
+    "    [465, 13476, 11, 339]\n",
+    "])\n",
+    "\n",
+    "decoded_inputs = [tokenizer.decode(i) for i in inputs]\n",
+    "decoded_targets = [tokenizer.decode(t) for t in targets]\n",
+    "\n",
+    "print(\"Decoded Inputs:\")\n",
+    "for di in decoded_inputs:\n",
+    "    print(di)\n",
+    "\n",
+    "print(\"\\nDecoded Targets:\")\n",
+    "for dt in decoded_targets:\n",
+    "    print(dt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "defc6b2f-9ac2-49e0-a4e1-03247cacffce",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}