
Use instance tokenizer (#116)

* Use instance tokenizer

* consistency updates

---------

Co-authored-by: Sebastian Raschka <mail@sebastianraschka.com>
James Holcombe, 1 year ago
commit 05718c6b94
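
All of the hunks below apply the same one-line pattern: the `tokenizer.encode(...)` call inside `GPTDatasetV1.__init__` now goes through `self.tokenizer`. For context, here is a minimal sketch of what the class looks like after the change. The `self.tokenizer = tokenizer` assignment and the `__len__`/`__getitem__` methods are not shown in the hunks and are assumed here for completeness.

```python
import torch
from torch.utils.data import Dataset


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer  # assumed assignment; not part of the hunks shown below
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text via the instance attribute (the change in this commit)
        token_ids = self.tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the text into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):  # assumed, standard Dataset boilerplate
        return len(self.input_ids)

    def __getitem__(self, idx):  # assumed, standard Dataset boilerplate
        return self.input_ids[idx], self.target_ids[idx]
```

The behavior is unchanged; storing the tokenizer on the instance just makes it accessible to other methods and keeps the notebooks and scripts consistent with each other.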

+ 1 - 1
appendix-D/01_main-chapter-code/previous_chapters.py

@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch02/01_main-chapter-code/ch02.ipynb

@@ -1273,7 +1273,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",

+ 2 - 2
ch02/01_main-chapter-code/dataloader.ipynb

@@ -48,7 +48,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -150,7 +150,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,

+ 2 - 2
ch02/01_main-chapter-code/exercise-solutions.ipynb

@@ -256,7 +256,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt)\n",
+    "        token_ids = self.tokenizer.encode(txt)\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -377,7 +377,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,

+ 2 - 2
ch03/01_main-chapter-code/multihead-attention.ipynb

@@ -78,7 +78,7 @@
     "        self.target_ids = []\n",
     "\n",
     "        # Tokenize the entire text\n",
-    "        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
+    "        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})\n",
     "\n",
     "        # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
     "        for i in range(0, len(token_ids) - max_length, stride):\n",
@@ -374,7 +374,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.10"
   }
  },
  "nbformat": 4,

+ 1 - 1
ch04/01_main-chapter-code/gpt.py

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch04/01_main-chapter-code/previous_chapters.py

@@ -16,7 +16,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch05/01_main-chapter-code/previous_chapters.py

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch05/02_alternative_weight_loading/previous_chapters.py

@@ -19,7 +19,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):

+ 1 - 1
ch05/03_bonus_pretraining_on_gutenberg/previous_chapters.py

@@ -25,7 +25,7 @@ class GPTDatasetV1(Dataset):
         self.input_ids = []
         self.target_ids = []
 
-        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
+        token_ids = self.tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
 
         for i in range(0, len(token_ids) - max_length, stride):
             input_chunk = token_ids[i:i + max_length]

+ 1 - 1
ch05/05_bonus_hparam_tuning/previous_chapters.py

@@ -24,7 +24,7 @@ class GPTDatasetV1(Dataset):
         self.target_ids = []
 
         # Tokenize the entire text
-        token_ids = tokenizer.encode(txt)
+        token_ids = self.tokenizer.encode(txt)
 
         # Use a sliding window to chunk the book into overlapping sequences of max_length
         for i in range(0, len(token_ids) - max_length, stride):
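
For reference, a usage sketch of the dataset after this commit. The driver code below is illustrative only (the file name, batch size, and loader settings are assumptions, not part of the diff); it shows that callers still pass the tokenizer in exactly as before, and only the internal reference changes to `self.tokenizer`.

```python
import tiktoken
from torch.utils.data import DataLoader

# Build the BPE tokenizer once and hand it to the dataset,
# which now stores it as self.tokenizer.
tokenizer = tiktoken.get_encoding("gpt2")

with open("the-verdict.txt", "r", encoding="utf-8") as f:  # example input file
    raw_text = f.read()

dataset = GPTDatasetV1(raw_text, tokenizer, max_length=256, stride=128)
loader = DataLoader(dataset, batch_size=8, shuffle=True, drop_last=True)

inputs, targets = next(iter(loader))
print(inputs.shape, targets.shape)  # e.g. torch.Size([8, 256]) for both
```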