
Remove leftover instances of self.tokenizer (#201)

* Remove leftover instances of self.tokenizer

* Add <|endoftext|> token
Sebastian Raschka, 1 year ago
commit 72a073bbbf
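
The change in the files below is twofold: the unused `self.tokenizer` attribute is dropped from `GPTDatasetV1.__init__`, and the `tokenizer.encode` calls pass `allowed_special={"<|endoftext|>"}` so texts containing the `<|endoftext|>` marker tokenize without errors. As a minimal illustrative sketch (not part of the commit; it assumes the tiktoken GPT-2 BPE tokenizer, which matches the `<|endoftext|>` token used here), this is why the `allowed_special` argument matters:

    import tiktoken

    # GPT-2 BPE tokenizer; <|endoftext|> is its registered special token
    tokenizer = tiktoken.get_encoding("gpt2")

    text = "Hello, world. <|endoftext|> In the sunlit terraces"

    # Without allowed_special, tiktoken refuses text that contains a
    # special token and raises a ValueError:
    #   tokenizer.encode(text)  # ValueError

    # Explicitly allowing <|endoftext|> maps it to its reserved id (50256)
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    print(token_ids)  # 50256 appears where <|endoftext|> occurred in the text
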

+ 1 - 1
appendix-D/01_main-chapter-code/previous_chapters.py

@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 2
appendix-E/01_main-chapter-code/previous_chapters.py

@@ -28,12 +28,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch02/01_main-chapter-code/ch02.ipynb

@@ -1920,7 +1920,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,

+ 6 - 7
ch02/01_main-chapter-code/exercise-solutions.ipynb

@@ -248,7 +248,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9",
    "metadata": {},
    "outputs": [],
@@ -260,12 +260,11 @@
     "\n",
     "class GPTDatasetV1(Dataset):\n",
     "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-    "        self.tokenizer = tokenizer\n",
     "        self.input_ids = []\n",
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = self.tokenizer.encode(txt)\n",
+    "        token_ids = tokenizer.encode(txt)\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -311,7 +310,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "0128eefa-d7c8-4f76-9851-566dfa7c3745",
    "metadata": {},
    "outputs": [
@@ -324,7 +323,7 @@
        "        [ 402,  271]])"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -341,7 +340,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "ff5c1e90-c6de-4a87-adf6-7e19f603291c",
    "metadata": {},
    "outputs": [
@@ -354,7 +353,7 @@
        "        [  402,   271, 10899,  2138,   257,  7026, 15632,   438]])"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }

+ 1 - 2
ch03/01_main-chapter-code/multihead-attention.ipynb

@@ -82,12 +82,11 @@
     "\n",
     "class GPTDatasetV1(Dataset):\n",
     "    def __init__(self, txt, tokenizer, max_length, stride):\n",
-    "        self.tokenizer = tokenizer\n",
     "        self.input_ids = []\n",
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",

+ 1 - 1
ch04/01_main-chapter-code/previous_chapters.py

@@ -15,7 +15,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = self.tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch04/02_performance-analysis/previous_chapters.py

@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch05/01_main-chapter-code/previous_chapters.py

@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch05/02_alternative_weight_loading/previous_chapters.py

@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch05/05_bonus_hparam_tuning/previous_chapters.py

@@ -23,7 +23,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 2
ch06/01_main-chapter-code/previous_chapters.py

@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 2
ch06/02_bonus_additional-experiments/previous_chapters.py

@@ -20,12 +20,11 @@ from torch.utils.data import Dataset, DataLoader
 
 class GPTDatasetV1(Dataset):
     def __init__(self, txt, tokenizer, max_length, stride):
-        self.tokenizer = tokenizer
         self.input_ids = []
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch06/03_bonus_imdb-classification/previous_chapters.py

@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
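
For reference, this is how the start of `GPTDatasetV1` reads in the `previous_chapters.py` files after this commit. The loop body and the `__len__`/`__getitem__` methods are not shown in the hunks above, so the parts marked as assumptions below are a sketch of the book's standard sliding-window implementation, not a verbatim copy of the patched files:

    import torch
    from torch.utils.data import Dataset


    class GPTDatasetV1(Dataset):
        def __init__(self, txt, tokenizer, max_length, stride):
            self.input_ids = []
            self.target_ids = []

            # Tokenize the entire text
            token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

            # Use a sliding window to chunk the book into overlapping sequences of max_length
            for i in range(0, len(token_ids) - max_length, stride):
                # Assumption: inputs and targets are offset by one token,
                # as in the standard implementation (not shown in the hunks above)
                input_chunk = token_ids[i:i + max_length]
                target_chunk = token_ids[i + 1:i + max_length + 1]
                self.input_ids.append(torch.tensor(input_chunk))
                self.target_ids.append(torch.tensor(target_chunk))

        # Assumption: __len__ and __getitem__ as in the standard implementation
        def __len__(self):
            return len(self.input_ids)

        def __getitem__(self, idx):
            return self.input_ids[idx], self.target_ids[idx]
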