@@ -234,7 +234,7 @@
  "\n",
  "GPT_CONFIG_124M = {\n",
  " \"vocab_size\": 50257, # Vocabulary size\n",
- " \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
+ " \"context_length\": 256, # Shortened context length (orig: 1024)\n",
  " \"emb_dim\": 768, # Embedding dimension\n",
  " \"n_heads\": 12, # Number of attention heads\n",
  " \"n_layers\": 12, # Number of layers\n",
@@ -286,7 +286,7 @@
  " model=model,\n",
  " idx=text_to_token_ids(start_context, tokenizer),\n",
  " max_new_tokens=25,\n",
- " context_size=GPT_CONFIG_124M[\"ctx_len\"]\n",
+ " context_size=GPT_CONFIG_124M[\"context_length\"]\n",
  ")\n",
  "\n",
  "print(\"Output text:\\n\", token_ids_to_text(token_ids, tokenizer))"
@@ -314,7 +314,7 @@
  " model=model,\n",
  " idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
  " max_new_tokens=25,\n",
- " context_size=GPT_CONFIG_124M[\"ctx_len\"],\n",
+ " context_size=GPT_CONFIG_124M[\"context_length\"],\n",
  " top_k=None,\n",
  " temperature=0.0\n",
  ")\n",
@@ -344,7 +344,7 @@
  " model=model,\n",
  " idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
  " max_new_tokens=25,\n",
- " context_size=GPT_CONFIG_124M[\"ctx_len\"],\n",
+ " context_size=GPT_CONFIG_124M[\"context_length\"],\n",
  " top_k=None,\n",
  " temperature=0.0\n",
  ")\n",
@@ -383,13 +383,13 @@
  "\n",
  "\n",
  "GPT_CONFIG_124M = {\n",
- " \"vocab_size\": 50257, # Vocabulary size\n",
- " \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
- " \"emb_dim\": 768, # Embedding dimension\n",
- " \"n_heads\": 12, # Number of attention heads\n",
- " \"n_layers\": 12, # Number of layers\n",
- " \"drop_rate\": 0.1, # Dropout rate\n",
- " \"qkv_bias\": False # Query-key-value bias\n",
+ " \"vocab_size\": 50257, # Vocabulary size\n",
+ " \"context_length\": 256, # Shortened context length (orig: 1024)\n",
+ " \"emb_dim\": 768, # Embedding dimension\n",
+ " \"n_heads\": 12, # Number of attention heads\n",
+ " \"n_layers\": 12, # Number of layers\n",
+ " \"drop_rate\": 0.1, # Dropout rate\n",
+ " \"qkv_bias\": False # Query-key-value bias\n",
  "}\n",
  "\n",
  "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
@@ -451,8 +451,8 @@
  "train_loader = create_dataloader_v1(\n",
  " train_data,\n",
  " batch_size=2,\n",
- " max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
- " stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
+ " max_length=GPT_CONFIG_124M[\"context_length\"],\n",
+ " stride=GPT_CONFIG_124M[\"context_length\"],\n",
  " drop_last=True,\n",
  " shuffle=True\n",
  ")\n",
@@ -460,8 +460,8 @@
  "val_loader = create_dataloader_v1(\n",
  " val_data,\n",
  " batch_size=2,\n",
- " max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
- " stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
+ " max_length=GPT_CONFIG_124M[\"context_length\"],\n",
+ " stride=GPT_CONFIG_124M[\"context_length\"],\n",
  " drop_last=False,\n",
  " shuffle=False\n",
  ")"
@@ -557,13 +557,13 @@
  "\n",
  "\n",
  "GPT_CONFIG_124M = {\n",
- " \"vocab_size\": 50257, # Vocabulary size\n",
- " \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
- " \"emb_dim\": 768, # Embedding dimension\n",
- " \"n_heads\": 12, # Number of attention heads\n",
- " \"n_layers\": 12, # Number of layers\n",
- " \"drop_rate\": 0.1, # Dropout rate\n",
- " \"qkv_bias\": False # Query-key-value bias\n",
+ " \"vocab_size\": 50257, # Vocabulary size\n",
+ " \"context_length\": 256, # Shortened context length (orig: 1024)\n",
+ " \"emb_dim\": 768, # Embedding dimension\n",
+ " \"n_heads\": 12, # Number of attention heads\n",
+ " \"n_layers\": 12, # Number of layers\n",
+ " \"drop_rate\": 0.1, # Dropout rate\n",
+ " \"qkv_bias\": False # Query-key-value bias\n",
  "}\n",
  "\n",
  "\n",
@@ -617,7 +617,7 @@
  "model_name = \"gpt2-small (124M)\" # Example model name\n",
  "NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
  "NEW_CONFIG.update(model_configs[model_name])\n",
- "NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
+ "NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
  "\n",
  "gpt = GPTModel(NEW_CONFIG)\n",
  "gpt.eval();"
@@ -675,8 +675,8 @@
  "train_loader = create_dataloader_v1(\n",
  " train_data,\n",
  " batch_size=2,\n",
- " max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
- " stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
+ " max_length=GPT_CONFIG_124M[\"context_length\"],\n",
+ " stride=GPT_CONFIG_124M[\"context_length\"],\n",
  " drop_last=True,\n",
  " shuffle=True\n",
  ")\n",
@@ -684,8 +684,8 @@
  "val_loader = create_dataloader_v1(\n",
  " val_data,\n",
  " batch_size=2,\n",
- " max_length=GPT_CONFIG_124M[\"ctx_len\"],\n",
- " stride=GPT_CONFIG_124M[\"ctx_len\"],\n",
+ " max_length=GPT_CONFIG_124M[\"context_length\"],\n",
+ " stride=GPT_CONFIG_124M[\"context_length\"],\n",
  " drop_last=False,\n",
  " shuffle=False\n",
  ")"
@@ -753,7 +753,7 @@
  "model_name = \"gpt2-xl (1558M)\"\n",
  "NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
  "NEW_CONFIG.update(model_configs[model_name])\n",
- "NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
+ "NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
  "\n",
  "gpt = GPTModel(NEW_CONFIG)\n",
  "gpt.eval();\n",
@@ -811,13 +811,13 @@
  "\n",
  "\n",
  "GPT_CONFIG_124M = {\n",
- " \"vocab_size\": 50257, # Vocabulary size\n",
- " \"ctx_len\": 256, # Shortened context length (orig: 1024)\n",
- " \"emb_dim\": 768, # Embedding dimension\n",
- " \"n_heads\": 12, # Number of attention heads\n",
- " \"n_layers\": 12, # Number of layers\n",
- " \"drop_rate\": 0.1, # Dropout rate\n",
- " \"qkv_bias\": False # Query-key-value bias\n",
+ " \"vocab_size\": 50257, # Vocabulary size\n",
+ " \"context_length\": 256, # Shortened context length (orig: 1024)\n",
+ " \"emb_dim\": 768, # Embedding dimension\n",
+ " \"n_heads\": 12, # Number of attention heads\n",
+ " \"n_layers\": 12, # Number of layers\n",
+ " \"drop_rate\": 0.1, # Dropout rate\n",
+ " \"qkv_bias\": False # Query-key-value bias\n",
  "}\n",
  "\n",
  "\n",
@@ -859,7 +859,7 @@
  "model_name = \"gpt2-xl (1558M)\"\n",
  "NEW_CONFIG = GPT_CONFIG_124M.copy()\n",
  "NEW_CONFIG.update(model_configs[model_name])\n",
- "NEW_CONFIG.update({\"ctx_len\": 1024, \"qkv_bias\": True})\n",
+ "NEW_CONFIG.update({\"context_length\": 1024, \"qkv_bias\": True})\n",
  "\n",
  "gpt = GPTModel(NEW_CONFIG)\n",
  "gpt.eval()\n",
@@ -901,7 +901,7 @@
  " model=gpt,\n",
  " idx=text_to_token_ids(\"Every effort moves you\", tokenizer),\n",
  " max_new_tokens=25,\n",
- " context_size=NEW_CONFIG[\"ctx_len\"],\n",
+ " context_size=NEW_CONFIG[\"context_length\"],\n",
  " top_k=50,\n",
  " temperature=1.5\n",
  ")\n",
@@ -926,7 +926,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.11.4"
+ "version": "3.10.6"
  }
  },
  "nbformat": 4,