@@ -1296,6 +1296,7 @@
"\n",
" # Tokenize the entire text\n",
" token_ids = tokenizer.encode(txt, allowed_special={\"<|endoftext|>\"})\n",
+ " assert len(token_ids) > max_length, \"Number of tokenized inputs must at least be equal to max_length+1\"\n",
"\n",
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
" for i in range(0, len(token_ids) - max_length, stride):\n",
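The added assertion fails fast when the tokenized text is too short: with max_length or fewer tokens, the sliding-window loop's range is empty and the dataset would silently contain zero samples. Below is a minimal, self-contained sketch of the same chunking pattern (not part of the diff; it assumes the tiktoken package with the GPT-2 encoding, and the helper name is illustrative only):

import tiktoken

def chunk_with_sliding_window(txt, max_length=4, stride=1):
    # Illustrative helper mirroring the sliding-window logic in the hunk above.
    tokenizer = tiktoken.get_encoding("gpt2")
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
    # The new guard: with max_length or fewer tokens, the range below is empty.
    assert len(token_ids) > max_length, "Number of tokenized inputs must be at least max_length+1"

    inputs, targets = [], []
    for i in range(0, len(token_ids) - max_length, stride):
        inputs.append(token_ids[i:i + max_length])            # input chunk
        targets.append(token_ids[i + 1:i + max_length + 1])   # targets shifted by one token
    return inputs, targets

# A short sentence yields a few overlapping chunks; a text with
# max_length or fewer tokens trips the assertion instead.
x, y = chunk_with_sliding_window("In the heart of the city stood the old library.")
print(len(x), x[0], y[0])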