9 months ago · c39aa32ef5
--- a/appendix-E/01_main-chapter-code/appendix-E.ipynb
+++ b/appendix-E/01_main-chapter-code/appendix-E.ipynb
@@ -48,12 +48,12 @@
 
				      "name": "stdout",
			
 
				      "output_type": "stream",
			
 
				      "text": [
			
 
				-      "matplotlib version: 3.7.2\n",
			
 
				-      "numpy version: 1.25.2\n",
			
 
				-      "tiktoken version: 0.5.1\n",
			
 
				-      "torch version: 2.2.2\n",
			
 
				-      "tensorflow version: 2.15.0\n",
			
 
				-      "pandas version: 2.0.3\n"
			
 
				+      "matplotlib version: 3.10.0\n",
			
 
				+      "numpy version: 2.0.2\n",
			
 
				+      "tiktoken version: 0.9.0\n",
			
 
				+      "torch version: 2.6.0\n",
			
 
				+      "tensorflow version: 2.18.0\n",
			
 
				+      "pandas version: 2.2.3\n"
			
 
				      ]
			
 
				     }
			
 
				    ],
			
@@ -190,6 +190,7 @@
 
				     }
			
 
				    ],
			
 
				    "source": [
			
 
				+    "import urllib\n",
			
 
				     "from pathlib import Path\n",
			
 
				     "import pandas as pd\n",
			
 
				     "from previous_chapters import (\n",
			
@@ -204,7 +205,13 @@
 
				     "extracted_path = \"sms_spam_collection\"\n",
			
 
				     "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
			
 
				     "\n",
			
 
				-    "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
			
 
				+    "try:\n",
			
 
				+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
			
 
				+    "except urllib.error.HTTPError:\n",
			
 
				+    "    print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
			
 
				+    "          \" temporary unavailable. Using backup URL.\")\n",
			
 
				+    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
			
 
				+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
			
 
				     "\n",
			
 
				     "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
			
 
				     "balanced_df = create_balanced_dataset(df)\n",
			
--- a/ch06/01_main-chapter-code/ch06.ipynb
+++ b/ch06/01_main-chapter-code/ch06.ipynb
@@ -50,7 +50,7 @@
 
				      "text": [
			
 
				       "matplotlib version: 3.10.0\n",
			
 
				       "numpy version: 2.0.2\n",
			
 
				-      "tiktoken version: 0.8.0\n",
			
 
				+      "tiktoken version: 0.9.0\n",
			
 
				       "torch version: 2.6.0\n",
			
 
				       "tensorflow version: 2.18.0\n",
			
 
				       "pandas version: 2.2.3\n"
			
@@ -167,7 +167,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "code",
			
 
				-   "execution_count": 3,
			
 
				+   "execution_count": 2,
			
 
				    "id": "def7c09b-af9c-4216-90ce-5e67aed1065c",
			
 
				    "metadata": {
			
 
				     "colab": {
			
@@ -181,7 +181,7 @@
 
				      "name": "stdout",
			
 
				      "output_type": "stream",
			
 
				      "text": [
			
 
				-      "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n"
			
 
				+      "File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv\n"
			
 
				      ]
			
 
				     }
			
 
				    ],
			
@@ -215,7 +215,13 @@
 
				     "    os.rename(original_file_path, data_file_path)\n",
			
 
				     "    print(f\"File downloaded and saved as {data_file_path}\")\n",
			
 
				     "\n",
			
 
				-    "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)"
			
 
				+    "try:\n",
			
 
				+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
			
 
				+    "except urllib.error.HTTPError:\n",
			
 
				+    "    print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
			
 
				+    "          \" temporary unavailable. Using backup URL.\")\n",
			
 
				+    "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
			
 
				+    "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) "
			
 
				    ]
			
 
				   },
			
 
				   {
			
--- a/ch06/01_main-chapter-code/gpt_class_finetune.py
+++ b/ch06/01_main-chapter-code/gpt_class_finetune.py
@@ -276,7 +276,16 @@ if __name__ == "__main__":
 
				     extracted_path = "sms_spam_collection"
			
 
				     data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
			
 
				 
			
 
				-    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode)
			
 
				+    try:
			
 
				+        download_and_unzip_spam_data(
			
 
				+            url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
			
 
				+        )
			
 
				+    except urllib.error.HTTPError:
			
 
				+        backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
			
 
				+        download_and_unzip_spam_data(
			
 
				+            backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
			
 
				+        )
			
 
				+
			
 
				     df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
			
 
				     balanced_df = create_balanced_dataset(df)
			
 
				     balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})
			
--- a/ch06/02_bonus_additional-experiments/additional_experiments.py
+++ b/ch06/02_bonus_additional-experiments/additional_experiments.py
@@ -603,7 +603,11 @@ if __name__ == "__main__":
 
				     all_exist = all((base_path / file_name).exists() for file_name in file_names)
			
 
				 
			
 
				     if not all_exist:
			
 
				-        download_and_unzip(url, zip_path, extract_to, new_file_path)
			
 
				+        try:
			
 
				+            download_and_unzip(url, zip_path, extract_to, new_file_path)
			
 
				+        except urllib.error.HTTPError:
			
 
				+            backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
			
 
				+            download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
			
 
				         create_dataset_csvs(new_file_path)
			
 
				 
			
 
				     tokenizer = tiktoken.get_encoding("gpt2")
			
--- a/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
+++ b/ch06/03_bonus_imdb-classification/train_bert_hf_spam.py
@@ -410,7 +410,11 @@ if __name__ == "__main__":
 
				     all_exist = all((base_path / file_name).exists() for file_name in file_names)
			
 
				 
			
 
				     if not all_exist:
			
 
				-        download_and_unzip(url, zip_path, extract_to, new_file_path)
			
 
				+        try:
			
 
				+            download_and_unzip(url, zip_path, extract_to, new_file_path)
			
 
				+        except urllib.error.HTTPError:
			
 
				+            backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
			
 
				+            download_and_unzip(backup_url, zip_path, extract_to, new_file_path)
			
 
				         create_dataset_csvs(new_file_path)
			
 
				 
			
 
				     if args.use_attention_mask.lower() == "true":