@@ -436,7 +436,7 @@
" \"n_layers\": 28, # Number of layers\n",
" \"hidden_dim\": 3072, # Size of the intermediate dimension in FeedForward\n",
" \"head_dim\": 128, # Size of the heads in GQA\n",
- " \"qk_norm\": True, # Whether to normalize queries and values in GQA\n",
+ " \"qk_norm\": True, # Whether to normalize queries and keys in GQA\n",
" \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
" \"rope_base\": 1_000_000.0, # The base in RoPE's \"theta\"\n",
" \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
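
For context, the corrected comment refers to QK normalization: when "qk_norm" is enabled, the queries and keys (not the values) are normalized per head before attention. The following is a minimal sketch of that idea, assuming an RMSNorm-style normalization over the head dimension and illustrative tensor shapes (16 query heads, 8 KV heads matching "n_kv_groups", "head_dim" of 128); it is not the notebook's exact implementation.

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    # Illustrative RMSNorm over the last (head_dim) dimension
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x * rms)

head_dim = 128                 # matches "head_dim" in the config above
q_norm = RMSNorm(head_dim)     # applied to queries
k_norm = RMSNorm(head_dim)     # applied to keys

# Assumed shapes: (batch, n_heads, seq_len, head_dim); keys use 8 KV heads under GQA
queries = torch.randn(1, 16, 10, head_dim)
keys = torch.randn(1, 8, 10, head_dim)

queries = q_norm(queries)      # normalized
keys = k_norm(keys)            # normalized
# values are left untouched, hence the corrected comment in the diff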