Browse Source

Qwen3 tokenizer sanity checks (#730)

Sebastian Raschka 4 months ago
parent
commit
b8c8237251
1 changed file with 8 additions and 0 deletions
  1. 8 0
      pkg/llms_from_scratch/tests/test_qwen3.py

+ 8 - 0
pkg/llms_from_scratch/tests/test_qwen3.py

@@ -279,3 +279,11 @@ def test_tokenizer_equivalence():
 
         assert tokenizer_ref.eos_token_id == tokenizer.eos_token_id
         assert tokenizer_ref.pad_token_id == tokenizer.pad_token_id
+
+        assert tokenizer.encode("<|endoftext|>") == [tokenizer._special_to_id["<|endoftext|>"]]
+        assert tokenizer.encode("<|im_end|>") == [tokenizer._special_to_id["<|im_end|>"]]
+
+        expected_eos_token = "<|im_end|>" if "Base" not in repo_id else "<|endoftext|>"
+        expected_pad_token = "<|endoftext|>"
+        assert tokenizer.decode([tokenizer.eos_token_id]) == expected_eos_token
+        assert tokenizer.decode([tokenizer.pad_token_id]) == expected_pad_token