Sebastian Raschka 3 месяца назад
Родитель
Commit
e9c1c1da38
2 измененных файла: 2 добавлено и 2 удалено
  1. 1 1
      ch05/11_qwen3/standalone-qwen3.ipynb
  2. 1 1
      pkg/llms_from_scratch/qwen3.py

+ 1 - 1
ch05/11_qwen3/standalone-qwen3.ipynb

@@ -436,7 +436,7 @@
     "        \"n_layers\": 28,                  # Number of layers\n",
     "        \"n_layers\": 28,                  # Number of layers\n",
     "        \"hidden_dim\": 3072,              # Size of the intermediate dimension in FeedForward\n",
     "        \"hidden_dim\": 3072,              # Size of the intermediate dimension in FeedForward\n",
     "        \"head_dim\": 128,                 # Size of the heads in GQA\n",
     "        \"head_dim\": 128,                 # Size of the heads in GQA\n",
-    "        \"qk_norm\": True,                 # Whether to normalize queries and values in GQA\n",
+    "        \"qk_norm\": True,                 # Whether to normalize queries and keys in GQA\n",
     "        \"n_kv_groups\": 8,                # Key-Value groups for grouped-query attention\n",
     "        \"n_kv_groups\": 8,                # Key-Value groups for grouped-query attention\n",
     "        \"rope_base\": 1_000_000.0,        # The base in RoPE's \"theta\"\n",
     "        \"rope_base\": 1_000_000.0,        # The base in RoPE's \"theta\"\n",
     "        \"dtype\": torch.bfloat16,         # Lower-precision dtype to reduce memory usage\n",
     "        \"dtype\": torch.bfloat16,         # Lower-precision dtype to reduce memory usage\n",

+ 1 - 1
pkg/llms_from_scratch/qwen3.py

@@ -22,7 +22,7 @@ QWEN_CONFIG_06_B = {
     "n_layers": 28,                  # Number of layers
     "n_layers": 28,                  # Number of layers
     "hidden_dim": 3072,              # Size of the intermediate dimension in FeedForward
     "hidden_dim": 3072,              # Size of the intermediate dimension in FeedForward
     "head_dim": 128,                 # Size of the heads in GQA
     "head_dim": 128,                 # Size of the heads in GQA
-    "qk_norm": True,                 # Whether to normalize queries and values in GQA
+    "qk_norm": True,                 # Whether to normalize queries and keys in GQA
     "n_kv_groups": 8,                # Key-Value groups for grouped-query attention
     "n_kv_groups": 8,                # Key-Value groups for grouped-query attention
     "rope_base": 1_000_000.0,        # The base in RoPE's "theta"
     "rope_base": 1_000_000.0,        # The base in RoPE's "theta"
     "dtype": torch.bfloat16,         # Lower-precision dtype to reduce memory usage
     "dtype": torch.bfloat16,         # Lower-precision dtype to reduce memory usage