|
@@ -2,20 +2,28 @@
|
|
|
"cells": [
|
|
"cells": [
|
|
|
{
|
|
{
|
|
|
"cell_type": "markdown",
|
|
"cell_type": "markdown",
|
|
|
- "id": "a9adc3bf-353c-411e-a471-0e92786e7103",
|
|
|
|
|
|
|
+ "id": "8a9e554f-58e3-4787-832d-d149add1b857",
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"source": [
|
|
"source": [
|
|
|
- "# Using BytePair encodding from `tiktoken`"
|
|
|
|
|
|
|
+ "- Install the additional package requirements for this bonus notebook by uncommenting and running the following cell:"
|
|
|
]
|
|
]
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
|
- "execution_count": 1,
|
|
|
|
|
- "id": "4036ffa3-0e5c-433a-a997-4ed7d33de0b2",
|
|
|
|
|
|
|
+ "execution_count": null,
|
|
|
|
|
+ "id": "d70bae22-b540-4a13-ab01-e748cb9d55c9",
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
"outputs": [],
|
|
|
"source": [
|
|
"source": [
|
|
|
- "# !pip install tiktoken"
|
|
|
|
|
|
|
+ "# pip install -r requirements-extra.txt"
|
|
|
|
|
+ ]
|
|
|
|
|
+ },
|
|
|
|
|
+ {
|
|
|
|
|
+ "cell_type": "markdown",
|
|
|
|
|
+ "id": "a9adc3bf-353c-411e-a471-0e92786e7103",
|
|
|
|
|
+ "metadata": {},
|
|
|
|
|
+ "source": [
|
|
|
|
|
+ "# Using BytePair encoding from `tiktoken`"
|
|
|
]
|
|
]
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
@@ -205,16 +213,6 @@
|
|
|
"# Using the BytePair Tokenizer in HuggingFace transformers"
|
|
"# Using the BytePair Tokenizer in HuggingFace transformers"
|
|
|
]
|
|
]
|
|
|
},
|
|
},
|
|
|
- {
|
|
|
|
|
- "cell_type": "code",
|
|
|
|
|
- "execution_count": 12,
|
|
|
|
|
- "id": "5bfff386-f725-4137-9c50-e5da0c38bea0",
|
|
|
|
|
- "metadata": {},
|
|
|
|
|
- "outputs": [],
|
|
|
|
|
- "source": [
|
|
|
|
|
- "# pip install transformers"
|
|
|
|
|
- ]
|
|
|
|
|
- },
|
|
|
|
|
{
|
|
{
|
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
|
"execution_count": 13,
|
|
"execution_count": 13,
|