@@ -23,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "92b989e9-da36-4159-b212-799184764dd9",
    "metadata": {},
    "outputs": [
@@ -119,7 +119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "86000d74-624a-48f0-86da-f41926cb9e04",
    "metadata": {
     "colab": {
@@ -180,7 +180,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "5e062b82-3540-48ce-8eb4-009686d0d16c",
    "metadata": {},
    "outputs": [
@@ -260,7 +260,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "6b5402f8-ec0c-4a44-9892-18a97779ee4f",
    "metadata": {
     "colab": {
@@ -290,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "e7b6ec51-6f8c-49bd-a349-95ba38b46fb6",
    "metadata": {},
    "outputs": [
@@ -345,7 +345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "34ebd76a-16ec-4c17-8958-8a135735cc1c",
    "metadata": {
     "colab": {
@@ -385,7 +385,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "c990ead6-53cd-49a7-a6d1-14d8c1518249",
    "metadata": {},
    "outputs": [
@@ -430,7 +430,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "54aef09c-d6e3-4238-8653-b3a1b0a1077a",
    "metadata": {
     "colab": {
@@ -470,7 +470,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "31402a67-a16e-4aeb-977e-70abb9c9949b",
    "metadata": {
     "colab": {
@@ -504,7 +504,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "9b003797-161b-4d98-81dc-e68320e09fec",
    "metadata": {
     "colab": {
@@ -548,7 +548,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "176ddf35-1c5f-4d7c-bf17-70f3e7069bd4",
    "metadata": {},
    "outputs": [
@@ -591,7 +591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "695d6f64-5084-4c23-aea4-105c9e38cfe4",
    "metadata": {
     "colab": {
@@ -628,7 +628,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "0e17e027-ab9f-4fb5-ac9b-a009b831c122",
    "metadata": {
     "colab": {
@@ -666,7 +666,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "62d0816e-b29a-4c8f-a9a5-a167562de978",
    "metadata": {
     "colab": {
@@ -700,7 +700,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "168952a1-b964-4aa7-8e49-966fa26add54",
    "metadata": {
     "colab": {
@@ -764,7 +764,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "id": "654fde37-b2a9-4a20-a8d3-0206c056e2ff",
    "metadata": {},
    "outputs": [],
@@ -795,7 +795,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "id": "6kgJbe4ehI4q",
    "metadata": {
     "colab": {
@@ -821,7 +821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "id": "j2XPde_ThM_e",
    "metadata": {
     "colab": {
@@ -847,7 +847,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "id": "6b46a952-d50a-4837-af09-4095698f7fd1",
    "metadata": {
     "colab": {
@@ -903,7 +903,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "id": "0959c855-f860-4358-8b98-bc654f047578",
    "metadata": {},
    "outputs": [],
@@ -940,7 +940,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "f37b3eb0-854e-4895-9898-fa7d1e67566e",
    "metadata": {},
    "outputs": [],
@@ -977,7 +977,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "ca0116d0-d229-472c-9fbf-ebc229331c3e",
    "metadata": {},
    "outputs": [
@@ -1021,7 +1021,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "eb860488-5453-41d7-9870-23b723f742a0",
    "metadata": {
     "colab": {
@@ -1066,7 +1066,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc",
    "metadata": {
     "id": "7b9de31e-4096-47b3-976d-b6d2fdce04bc"
@@ -1110,7 +1110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "id": "56f5b0c9-1065-4d67-98b9-010e42fc1e2a",
    "metadata": {},
    "outputs": [
@@ -1178,8 +1178,7 @@
     "                       eval_freq, eval_iter, start_context):\n",
     "    # Initialize lists to track losses and tokens seen\n",
     "    train_losses, val_losses, track_tokens_seen = [], [], []\n",
-    "    tokens_seen = 0\n",
-    "    global_step = -1\n",
+    "    tokens_seen, global_step = 0, -1\n",
     "\n",
     "    # Main training loop\n",
     "    for epoch in range(num_epochs):\n",
@@ -1408,7 +1407,7 @@
    "metadata": {},
    "source": [
     "- Inference is relatively cheap with a relatively small LLM as the GPT model we trained above, so there's no need to use a GPU for it in case you used a GPU for training it above\n",
-    "- Using the `generate_text_simple method` (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n",
+    "- Using the `generate_text_simple` function (from the previous chapter) that we used earlier inside the simple training function, we can generate new text one word (or token) at a time\n",
     "- As explained in section 5.1.2, the next generated token is the token corresponding to the largest probability score among all tokens in the vocabulary"
    ]
   },
@@ -1498,8 +1497,6 @@
     }
    ],
    "source": [
-    "# Assume some logits from a neural network output for 7 vocabulary tokens\n",
-    "\n",
     "vocab = { \n",
     "    \"closer\": 0,\n",
     "    \"every\": 1, \n",
@@ -1527,12 +1524,74 @@
     "print(inverse_vocab[next_token_id])"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "6400572f-b3c8-49e2-95bc-433e55c5b3a1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "forward\n"
+     ]
+    }
+   ],
+   "source": [
+    "torch.manual_seed(123)\n",
+    "next_token_id = torch.multinomial(probas, num_samples=1).item()\n",
+    "print(inverse_vocab[next_token_id])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "73 x closer\n",
+      "0 x every\n",
+      "0 x effort\n",
+      "582 x forward\n",
+      "2 x inches\n",
+      "0 x moves\n",
+      "0 x pizza\n",
+      "343 x toward\n"
+     ]
+    }
+   ],
+   "source": [
+    "def print_sampled_tokens(probas):\n",
+    "    torch.manual_seed(123) # Manual seed for reproducibility\n",
+    "    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n",
+    "    sampled_ids = torch.bincount(torch.tensor(sample))\n",
+    "    for i, freq in enumerate(sampled_ids):\n",
+    "        print(f\"{freq} x {inverse_vocab[i]}\")\n",
+    "\n",
+    "print_sampled_tokens(probas)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9",
+   "metadata": {},
+   "source": [
+    "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the most likely token by sampling from the softmax distribution\n",
+    "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "32e7d9cf-a26d-4d9a-8664-4af1efa73832",
    "metadata": {},
    "source": [
-    "- \"Temperature scaling\" is just a fancy word for diving the logits by a number greater than 0\n",
+    "- We can control the distribution and selection process via a concept called temperature scaling\n",
+    "- \"Temperature scaling\" is just a fancy word for dividing the logits by a number greater than 0\n",
     "- Temperatures greater than 1 will result in more uniformly distributed token probabilities after applying the softmax\n",
     "- Temperatures smaller than 1 will result in more confident (sharper or more peaky) distributions after applying the softmax"
    ]
@@ -1549,7 +1608,7 @@
     "    return torch.softmax(scaled_logits, dim=0)\n",
     "\n",
     "# Temperature values\n",
-    "temperatures = [1, 0.1, 5] # Original, higher confidence, and\n",
+    "temperatures = [1, 0.1, 5] # Original, higher confidence, and lower confidence\n",
     "\n",
     "# Calculate scaled probabilities\n",
     "scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]"
@@ -1591,49 +1650,6 @@
     "plt.show()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "c63d0a27-830b-42b5-9986-6d1a7de04dd9",
-   "metadata": {},
-   "source": [
-    "- Instead of determining the most likely token via `torch.argmax`, we use `torch.multinomial(probas, num_samples=1)` to determine the most likely token by sampling from the softmax distribution\n",
-    "- For illustration purposes, let's see what happens when we sample the next token 1,000 times using the original softmax probabilities:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "id": "b23b863e-252a-403c-b5b1-62bc0a42319f",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "73 x closer\n",
-      "0 x every\n",
-      "0 x effort\n",
-      "582 x forward\n",
-      "2 x inches\n",
-      "0 x moves\n",
-      "0 x pizza\n",
-      "343 x toward\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Original probas\n",
-    "\n",
-    "def print_sampled_tokens(probas):\n",
-    "    torch.manual_seed(123)\n",
-    "    sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1_000)]\n",
-    "    sampled_ids = torch.bincount(torch.tensor(sample))\n",
-    "    for i, freq in enumerate(sampled_ids):\n",
-    "        print(f\"{freq} x {inverse_vocab[i]}\")\n",
-    "\n",
-    "print_sampled_tokens(probas)"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "d750e989-842a-4cfa-a44b-cf44d6e49163",