10 months ago · dce46038da
--- a/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
+++ b/ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb
@@ -180,8 +180,8 @@
 
															      "name": "stderr",
														
 
															      "output_type": "stream",
														
 
															      "text": [
														
 
															-      "Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s]                                                   \n",
														
 
															-      "Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s]                                                       \n"
														
 
															+      "Fetching encoder.json: 1.04Mit [00:00, 4.13Mit/s]                                                   \n",
														
 
															+      "Fetching vocab.bpe: 457kit [00:00, 2.56Mit/s]                                                       \n"
														
 
															      ]
														
 
															     }
														
 
															    ],
														
@@ -306,6 +306,39 @@
 
															     "hf_tokenizer(strings)[\"input_ids\"]"
														
 
															    ]
														
 
															   },
														
 
															+  {
														
 
															+   "cell_type": "code",
														
 
															+   "execution_count": 15,
														
 
															+   "id": "a6233552",
														
 
															+   "metadata": {},
														
 
															+   "outputs": [],
														
 
															+   "source": [
														
 
															+    "from transformers import GPT2TokenizerFast\n",
														
 
															+    "\n",
														
 
															+    "hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained(\"gpt2\")"
														
 
															+   ]
														
 
															+  },
														
 
															+  {
														
 
															+   "cell_type": "code",
														
 
															+   "execution_count": 16,
														
 
															+   "id": "fa5ca643",
														
 
															+   "metadata": {},
														
 
															+   "outputs": [
														
 
															+    {
														
 
															+     "data": {
														
 
															+      "text/plain": [
														
 
															+       "[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]"
														
 
															+      ]
														
 
															+     },
														
 
															+     "execution_count": 16,
														
 
															+     "metadata": {},
														
 
															+     "output_type": "execute_result"
														
 
															+    }
														
 
															+   ],
														
 
															+   "source": [
														
 
															+    "hf_tokenizer_fast(strings)[\"input_ids\"]"
														
 
															+   ]
														
 
															+  },
														
 
															   {
														
 
															    "cell_type": "markdown",
														
 
															    "id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9",
														
@@ -319,7 +352,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 15,
														
 
															+   "execution_count": 17,
														
 
															    "id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c",
														
 
															    "metadata": {},
														
 
															    "outputs": [],
														
@@ -365,7 +398,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 16,
														
 
															+   "execution_count": 18,
														
 
															    "id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e",
														
 
															    "metadata": {},
														
 
															    "outputs": [],
														
@@ -382,7 +415,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 17,
														
 
															+   "execution_count": 19,
														
 
															    "id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7",
														
 
															    "metadata": {},
														
 
															    "outputs": [
														
@@ -413,7 +446,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 18,
														
 
															+   "execution_count": 20,
														
 
															    "id": "a61bb445-b151-4a2f-8180-d4004c503754",
														
 
															    "metadata": {},
														
 
															    "outputs": [],
														
@@ -432,7 +465,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 19,
														
 
															+   "execution_count": 21,
														
 
															    "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
														
 
															    "metadata": {},
														
 
															    "outputs": [
														
@@ -440,7 +473,7 @@
 
															      "name": "stdout",
														
 
															      "output_type": "stream",
														
 
															      "text": [
														
 
															-      "3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															+      "3.39 ms ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															      ]
														
 
															     }
														
 
															    ],
														
@@ -458,7 +491,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 20,
														
 
															+   "execution_count": 22,
														
 
															    "id": "036dd628-3591-46c9-a5ce-b20b105a8062",
														
 
															    "metadata": {},
														
 
															    "outputs": [
														
@@ -466,7 +499,7 @@
 
															      "name": "stdout",
														
 
															      "output_type": "stream",
														
 
															      "text": [
														
 
															-      "1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
														
 
															+      "1.08 ms ± 5.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
														
 
															      ]
														
 
															     }
														
 
															    ],
														
@@ -484,7 +517,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 21,
														
 
															+   "execution_count": 23,
														
 
															    "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
														
 
															    "metadata": {},
														
 
															    "outputs": [
														
@@ -499,7 +532,7 @@
 
															      "name": "stdout",
														
 
															      "output_type": "stream",
														
 
															      "text": [
														
 
															-      "10.3 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															+      "10.2 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															      ]
														
 
															     }
														
 
															    ],
														
@@ -509,7 +542,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 22,
														
 
															+   "execution_count": 24,
														
 
															    "id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
														
 
															    "metadata": {},
														
 
															    "outputs": [
														
@@ -517,7 +550,7 @@
 
															      "name": "stdout",
														
 
															      "output_type": "stream",
														
 
															      "text": [
														
 
															-      "10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															+      "10 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															      ]
														
 
															     }
														
 
															    ],
														
@@ -525,6 +558,49 @@
 
															     "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
														
 
															    ]
														
 
															   },
														
 
															+  {
														
 
															+   "cell_type": "code",
														
 
															+   "execution_count": 25,
														
 
															+   "id": "d6bfc7f0",
														
 
															+   "metadata": {},
														
 
															+   "outputs": [
														
 
															+    {
														
 
															+     "name": "stderr",
														
 
															+     "output_type": "stream",
														
 
															+     "text": [
														
 
															+      "Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors\n"
														
 
															+     ]
														
 
															+    },
														
 
															+    {
														
 
															+     "name": "stdout",
														
 
															+     "output_type": "stream",
														
 
															+     "text": [
														
 
															+      "3.79 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															+     ]
														
 
															+    }
														
 
															+   ],
														
 
															+   "source": [
														
 
															+    "%timeit hf_tokenizer_fast(raw_text)[\"input_ids\"]"
														
 
															+   ]
														
 
															+  },
														
 
															+  {
														
 
															+   "cell_type": "code",
														
 
															+   "execution_count": 26,
														
 
															+   "id": "da57c95a",
														
 
															+   "metadata": {},
														
 
															+   "outputs": [
														
 
															+    {
														
 
															+     "name": "stdout",
														
 
															+     "output_type": "stream",
														
 
															+     "text": [
														
 
															+      "3.83 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
														
 
															+     ]
														
 
															+    }
														
 
															+   ],
														
 
															+   "source": [
														
 
															+    "%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
														
 
															+   ]
														
 
															+  },
														
 
															   {
														
 
															    "cell_type": "markdown",
														
 
															    "id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
														
@@ -535,7 +611,7 @@
 
															   },
														
 
															   {
														
 
															    "cell_type": "code",
														
 
															-   "execution_count": 23,
														
 
															+   "execution_count": 27,
														
 
															    "id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429",
														
 
															    "metadata": {},
														
 
															    "outputs": [
														
@@ -543,7 +619,7 @@
 
															      "name": "stdout",
														
 
															      "output_type": "stream",
														
 
															      "text": [
														
 
															-      "1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
														
 
															+      "1.59 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
														
 
															      ]
														
 
															     }
														
 
															    ],