@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "92b989e9-da36-4159-b212-799184764dd9",
    "metadata": {},
    "outputs": [
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "86000d74-624a-48f0-86da-f41926cb9e04",
    "metadata": {
     "colab": {
@@ -180,7 +180,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "5e062b82-3540-48ce-8eb4-009686d0d16c",
    "metadata": {},
    "outputs": [
@@ -260,7 +260,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "6b5402f8-ec0c-4a44-9892-18a97779ee4f",
    "metadata": {
     "colab": {
@@ -290,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "e7b6ec51-6f8c-49bd-a349-95ba38b46fb6",
    "metadata": {},
    "outputs": [
@@ -345,7 +345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "34ebd76a-16ec-4c17-8958-8a135735cc1c",
    "metadata": {
     "colab": {
@@ -385,7 +385,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "c990ead6-53cd-49a7-a6d1-14d8c1518249",
    "metadata": {},
    "outputs": [
@@ -430,7 +430,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a",
    "metadata": {
     "colab": {
@@ -470,7 +470,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "31402a67-a16e-4aeb-977e-70abb9c9949b",
    "metadata": {
     "colab": {
@@ -504,7 +504,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "9b003797-161b-4d98-81dc-e68320e09fec",
    "metadata": {
     "colab": {
@@ -548,7 +548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "176ddf35-1c5f-4d7c-bf17-70f3e7069bd4",
    "metadata": {},
    "outputs": [
@@ -591,7 +591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "695d6f64-5084-4c23-aea4-105c9e38cfe4",
    "metadata": {
     "colab": {
@@ -628,7 +628,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "0e17e027-ab9f-4fb5-ac9b-a009b831c122",
    "metadata": {
     "colab": {
@@ -666,7 +666,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "62d0816e-b29a-4c8f-a9a5-a167562de978",
    "metadata": {
     "colab": {
@@ -700,7 +700,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "168952a1-b964-4aa7-8e49-966fa26add54",
    "metadata": {
     "colab": {
@@ -764,7 +764,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "654fde37-b2a9-4a20-a8d3-0206c056e2ff",
    "metadata": {},
    "outputs": [],
@@ -795,7 +795,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "6kgJbe4ehI4q",
    "metadata": {
     "colab": {
@@ -821,7 +821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "j2XPde_ThM_e",
    "metadata": {
     "colab": {
@@ -847,7 +847,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "id": "6b46a952-d50a-4837-af09-4095698f7fd1",
    "metadata": {
     "colab": {
@@ -903,7 +903,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "id": "0959c855-f860-4358-8b98-bc654f047578",
    "metadata": {},
    "outputs": [],
@@ -940,7 +940,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "f37b3eb0-854e-4895-9898-fa7d1e67566e",
    "metadata": {},
    "outputs": [],
@@ -977,7 +977,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "ca0116d0-d229-472c-9fbf-ebc229331c3e",
    "metadata": {},
    "outputs": [
@@ -1021,7 +1021,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "eb860488-5453-41d7-9870-23b723f742a0",
    "metadata": {
     "colab": {
@@ -1066,7 +1066,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc",
    "metadata": {
     "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc"
@@ -1110,7 +1110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "id": "56f5b0c9-1065-4d67-98b9-010e42fc1e2a",
    "metadata": {},
    "outputs": [
@@ -1178,8 +1178,7 @@
     "                       eval_freq, eval_iter, start_context):\n",
     "    # Initialize lists to track losses and tokens seen\n",
     "    train_losses, val_losses, track_tokens_seen = [], [], []\n",
-    "    tokens_seen = 0\n",
-    "    global_step = -1\n",
+    "    tokens_seen, global_step = 0, -1\n",
     "\n",
     "    # Main training loop\n",
     "    for epoch in range(num_epochs):\n",
@@ -1408,7 +1407,7 @@
    "metadata": {},
    "source": [
     "- Inference is relatively cheap with a relatively small LLM as the GPT model we trained above, so there's no need to use a GPU for it in case you used a GPU for training it above\n",
-    "- Using the `generate_text_simple method` (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n",
+    "- Using the `generate_text_simple` function (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n",
     "- As explained in section 5.1.2, the next generated token is the token corresponding to the largest probability score among all tokens in the vocabulary"
    ]
   },
@@ -1498,8 +1497,6 @@
     }
    ],
    "source": [
-    "# Assume some logits from a neural network output for 7 vocabulary tokens\n",
-    "\n",
     "vocab = { \n",
     "    \"closer\": 0,\n",
     "    \"every\": 1, \n",
@@ -1527,12 +1524,74 @@
     "print(inverse_vocab[next_token_id])"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "6400572f-b3c8-49e2-95bc-433e55c5b3a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "forward\n"
+     ]
+    }
+   ],
+   "source": [
+    "torch.manual_seed(123)\n",
+    "next_token_id = torch.multinomial(probas, num_samples=1).item()\n",
+    "print(inverse_vocab[next_token_id])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "73 x closer\n",
+      "0 x every\n",
+      "0 x effort\n",
+      "582 x forward\n",
+      "2 x inches\n",
+      "0 x moves\n",
+      "0 x pizza\n",
+      "343 x toward\n"
+     ]
+    }
+   ],
+   "source": [
+    "def print_sampled_tokens(probas):\n",
+    "    torch.manual_seed(123) # Manual seed for reproducibility\n",
+    "    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n",
+    "    sampled_ids = torch.bincount(torch.tensor(sample))\n",
+    "    for i, freq in enumerate(sampled_ids):\n",
+    "        print(f\"{freq} x {inverse_vocab[i]}\")\n",
+    "\n",
+    "print_sampled_tokens(probas)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9",
+   "metadata": {},
+   "source": [
+    "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the most likely token by sampling from the softmax distribution\n",
+    "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32e7d9cf-a26d-4d9a-8664-4af1efa73832",
    "metadata": {},
    "source": [
-    "- \"Temperature scaling\" is just a fancy word for diving the logits by a number greater than 0\n",
+    "- We can control the distribution and selection process via a concept called temperature scaling\n",
+    "- \"Temperature scaling\" is just a fancy word for dividing the logits by a number greater than 0\n",
     "- Temperatures greater than 1 will result in more uniformly distributed token probabilities after applying the softmax\n",
     "- Temperatures smaller than 1 will result in more confident (sharper or more peaky) distributions after applying the softmax"
    ]
@@ -1549,7 +1608,7 @@
     "    return torch.softmax(scaled_logits, dim=0)\n",
     "\n",
     "# Temperature values\n",
-    "temperatures = [1, 0.1, 5] # Original, higher confidence, and\n",
+    "temperatures = [1, 0.1, 5] # Original, higher confidence, and lower confidence\n",
     "\n",
     "# Calculate scaled probabilities\n",
     "scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]"
@@ -1591,49 +1650,6 @@
     "plt.show()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9",
-   "metadata": {},
-   "source": [
-    "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the most likely token by sampling from the softmax distribution\n",
-    "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "73 x closer\n",
-      "0 x every\n",
-      "0 x effort\n",
-      "582 x forward\n",
-      "2 x inches\n",
-      "0 x moves\n",
-      "0 x pizza\n",
-      "343 x toward\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Original probas\n",
-    "\n",
-    "def print_sampled_tokens(probas):\n",
-    "    torch.manual_seed(123)\n",
-    "    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n",
-    "    sampled_ids = torch.bincount(torch.tensor(sample))\n",
-    "    for i, freq in enumerate(sampled_ids):\n",
-    "        print(f\"{freq} x {inverse_vocab[i]}\")\n",
-    "\n",
-    "print_sampled_tokens(probas)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "d750e989-842a-4cfa-a44b-cf44d6e49163",