generate.py

# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import json
import numpy as np
import os
import requests
import tensorflow as tf
import tiktoken
import torch
from tqdm import tqdm

# Import from local files
from previous_chapters import GPTModel


def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text)
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())
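
# Usage sketch for the two helpers above (illustrative, not executed here),
# assuming the GPT-2 BPE tokenizer from tiktoken as imported above:
#
#   tokenizer = tiktoken.get_encoding("gpt2")
#   ids = text_to_token_ids("Every effort moves", tokenizer)   # shape: (1, num_tokens)
#   text = token_ids_to_text(ids, tokenizer)                   # round-trips back to the input text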


def download_and_load_gpt2(model_size, models_dir):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = os.path.join(base_url, model_size, filename)
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load hparams and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    hparams = json.load(open(os.path.join(model_dir, "hparams.json")))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams)

    return hparams, params


def download_file(url, destination):
    # Send a GET request to download the file in streaming mode
    response = requests.get(url, stream=True)

    # Get the total file size from headers, defaulting to 0 if not present
    file_size = int(response.headers.get("content-length", 0))

    # Check if file exists and has the same size
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    # Define the block size for reading the file
    block_size = 1024  # 1 Kilobyte

    # Initialize the progress bar with total file size
    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
        # Open the destination file in binary write mode
        with open(destination, "wb") as file:
            # Iterate over the file data in chunks
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))  # Update progress bar
                file.write(chunk)  # Write the chunk to the file


def load_gpt2_params_from_tf_ckpt(ckpt_path, hparams):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params
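
# Illustrative sketch of the nested dictionary returned above. The keys follow the
# variable names in OpenAI's TensorFlow GPT-2 checkpoint; the shapes shown are
# assumptions for the 124M model:
#
#   params["wte"]                               # token embedding matrix, e.g. (50257, 768)
#   params["wpe"]                               # positional embedding matrix, e.g. (1024, 768)
#   params["blocks"][0]["attn"]["c_attn"]["w"]  # fused query/key/value projection weights
#   params["blocks"][0]["ln_1"]["g"]            # first LayerNorm scale ("b" holds the shift)
#   params["g"], params["b"]                    # final LayerNorm scale and shift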


def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split((params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split((params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(gpt.trf_blocks[b].att.out_proj.weight, params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(gpt.trf_blocks[b].att.out_proj.bias, params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(gpt.trf_blocks[b].ff.layers[0].weight, params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(gpt.trf_blocks[b].ff.layers[0].bias, params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(gpt.trf_blocks[b].ff.layers[2].weight, params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(gpt.trf_blocks[b].ff.layers[2].bias, params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(gpt.trf_blocks[b].norm1.scale, params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(gpt.trf_blocks[b].norm1.shift, params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(gpt.trf_blocks[b].norm2.scale, params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(gpt.trf_blocks[b].norm2.shift, params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # Weight tying: the output head reuses the token embedding matrix
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


def generate(model, idx, max_new_tokens, context_size, temperature, top_k=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, vocab_size)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx
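
# Standalone usage sketch for generate() (illustrative; assumes a loaded model `gpt`
# and a `tokenizer` set up as in main() below):
#
#   out = generate(
#       model=gpt,
#       idx=text_to_token_ids("Every effort moves", tokenizer),
#       max_new_tokens=10,
#       context_size=1024,
#       temperature=0.0,   # temperature <= 0.0 skips sampling and uses greedy argmax decoding
#       top_k=None,
#   )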


def main(gpt_config, input_prompt, model_size):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    hparams, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

    gpt = GPTModel(gpt_config)
    load_weights_into_gpt(gpt, params)
    gpt.to(device)
    gpt.eval()

    tokenizer = tiktoken.get_encoding("gpt2")

    token_ids = generate(
        model=gpt,
        idx=text_to_token_ids(input_prompt, tokenizer).to(device),  # move prompt tokens to the same device as the model
        max_new_tokens=30,
        context_size=gpt_config["ctx_len"],
        top_k=1,
        temperature=1.0
    )

    print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


if __name__ == "__main__":

    torch.manual_seed(123)

    CHOOSE_MODEL = "gpt2-small"
    INPUT_PROMPT = "Every effort moves"

    BASE_CONFIG = {
        "vocab_size": 50257,  # Vocabulary size
        "ctx_len": 1024,      # Context length
        "drop_rate": 0.0,     # Dropout rate
        "qkv_bias": True      # Query-key-value bias
    }

    model_configs = {
        "gpt2-small": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    model_sizes = {
        "gpt2-small": "124M",
        "gpt2-medium": "355M",
        "gpt2-large": "774M",
        "gpt2-xl": "1558M"
    }

    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

    main(BASE_CONFIG, INPUT_PROMPT, model_sizes[CHOOSE_MODEL])
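
# To try a larger pretrained model, change CHOOSE_MODEL above to one of the other
# keys in model_configs (e.g. "gpt2-medium"); download size and memory use grow accordingly.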