|
|
@@ -48,12 +48,12 @@
|
|
|
"name": "stdout",
|
|
|
"output_type": "stream",
|
|
|
"text": [
|
|
|
- "matplotlib version: 3.7.2\n",
|
|
|
- "numpy version: 1.25.2\n",
|
|
|
- "tiktoken version: 0.5.1\n",
|
|
|
- "torch version: 2.2.2\n",
|
|
|
- "tensorflow version: 2.15.0\n",
|
|
|
- "pandas version: 2.0.3\n"
|
|
|
+ "matplotlib version: 3.10.0\n",
|
|
|
+ "numpy version: 2.0.2\n",
|
|
|
+ "tiktoken version: 0.9.0\n",
|
|
|
+ "torch version: 2.6.0\n",
|
|
|
+ "tensorflow version: 2.18.0\n",
|
|
|
+ "pandas version: 2.2.3\n"
|
|
|
]
|
|
|
}
|
|
|
],
|
|
|
@@ -190,6 +190,7 @@
|
|
|
}
|
|
|
],
|
|
|
"source": [
|
|
|
+ "import urllib\n",
|
|
|
"from pathlib import Path\n",
|
|
|
"import pandas as pd\n",
|
|
|
"from previous_chapters import (\n",
|
|
|
@@ -204,7 +205,13 @@
|
|
|
"extracted_path = \"sms_spam_collection\"\n",
|
|
|
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
|
|
|
"\n",
|
|
|
- "download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
|
|
+ "try:\n",
|
|
|
+ " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
|
|
+ "except urllib.error.HTTPError:\n",
|
|
|
+ " print(\"UCI Machine Learning Repository (https://archive.ics.uci.edu)\"\n",
|
|
|
+ " \" temporarily unavailable. Using backup URL.\")\n",
|
|
|
+ " url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
|
|
|
+ " download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
|
|
|
"\n",
|
|
|
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
|
|
|
"balanced_df = create_balanced_dataset(df)\n",
|