# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# Code to run the exercises; see exercise-solutions.ipynb for more information

from functools import partial
from importlib.metadata import version
import json
import os
import re
import time
import urllib.request

import matplotlib.pyplot as plt
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Import from local files in this folder
from gpt_download import download_and_load_gpt2
from previous_chapters import (
    calc_loss_loader,
    generate,
    GPTModel,
    load_weights_into_gpt,
    text_to_token_ids,
    train_model_simple,
    token_ids_to_text
)

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)
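
# A minimal usage sketch (assuming an `entry` dict with "instruction", "input",
# and "output" keys, as in the instruction-data.json file used below):
#
#   tokenizer = tiktoken.get_encoding("gpt2")
#   dataset = InstructionDataset([entry], tokenizer)
#   dataset[0]  # -> list of token IDs encoding the full prompt-plus-response text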

class InstructionDatasetWithMasking(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # New: Separate list for instruction lengths
        self.instruction_lengths = []
        self.encoded_texts = []

        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

            # New: collect instruction lengths
            instruction_length = len(tokenizer.encode(instruction_plus_input))
            self.instruction_lengths.append(instruction_length)

    def __getitem__(self, index):
        # New: return both instruction lengths and texts separately
        return self.instruction_lengths[index], self.encoded_texts[index]

    def __len__(self):
        return len(self.data)
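
# Sketch of the changed interface: each dataset item is now a (length, tokens) tuple,
#   instruction_length, token_ids = dataset[0]
# which lets custom_collate_with_masking_fn below know how many leading tokens
# belong to the instruction and should be excluded from the loss.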

class InstructionDatasetPhi(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            ###################################################################
            # NEW: Use `format_input_phi` and adjust the response text template
            instruction_plus_input = format_input_phi(entry)
            response_text = f"\n<|assistant|>:\n{entry['output']}"
            ###################################################################
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)
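
# For illustration, a hypothetical entry {"instruction": "Convert 45 km to meters.",
# "input": "", "output": "45 km is 45,000 meters."} yields the Phi-3-style full_text:
#
#   <|user|>
#   Convert 45 km to meters.
#   <|assistant|>:
#   45 km is 45,000 meters.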

def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
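
# Worked example (a sketch with small placeholder IDs instead of GPT-2 token IDs):
# for batch = [[1, 2, 3], [4, 5]] with pad_token_id=0 and ignore_index=-100,
#   inputs  -> [[1, 2, 3], [4, 5, 0]]
#   targets -> [[2, 3, 0], [5, 0, -100]]
# Targets are the inputs shifted by one position, and every padding token after
# the first is replaced by ignore_index, which PyTorch's cross entropy ignores
# by default.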

def custom_collate_with_masking_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    # New: batch items are now (instruction_length, item) tuples
    batch_max_length = max(len(item)+1 for _, item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for instruction_length, item in batch:  # New: batch items are now tuples
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Mask all input and instruction tokens in the targets
        # (use ignore_index rather than a hardcoded -100 so the two stay in sync)
        targets[:instruction_length-1] = ignore_index

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
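
# Worked example (same placeholder IDs as above): for a one-item batch
# [(2, [1, 2, 3, 4])] with pad_token_id=0, i.e. instruction_length=2,
#   inputs  -> [[1, 2, 3, 4]]
#   targets -> [[-100, 3, 4, 0]]
# Since targets are already shifted by one position, masking the first
# instruction_length-1 target positions hides exactly the instruction tokens,
# so only the response contributes to the training loss.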

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    with open(file_path, "r") as file:
        data = json.load(file)

    return data

def format_input_phi(entry):
    instruction_text = (
        f"<|user|>\n{entry['instruction']}"
    )

    input_text = f"\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text


def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text
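
# For illustration, a hypothetical entry
#   {"instruction": "Identify the verb.", "input": "The dog barked.", "output": "barked"}
# gives the Alpaca-style prompt via format_input:
#
#   Below is an instruction that describes a task. Write a response that appropriately completes the request.
#
#   ### Instruction:
#   Identify the verb.
#
#   ### Input:
#   The dog barked.
#
# and the more compact Phi-3-style prompt via format_input_phi:
#
#   <|user|>
#   Identify the verb.
#   The dog barked.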

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, plot_name):
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    print(f"Plot saved as {plot_name}")
    plt.savefig(plot_name)
    # plt.show()

def main(mask_instructions=False, alpaca52k=False, phi3_prompt=False):
    #######################################
    # Print package versions
    #######################################
    print()
    pkgs = [
        "matplotlib",  # Plotting library
        "tiktoken",    # Tokenizer
        "torch",       # Deep learning library
        "tqdm",        # Progress bar
        "tensorflow",  # For OpenAI's pretrained weights
    ]
    for p in pkgs:
        print(f"{p} version: {version(p)}")
    print(50*"-")

    #######################################
    # Download and prepare dataset
    #######################################
    file_path = "instruction-data.json"

    if alpaca52k:
        url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
    else:
        url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)

    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]  # Remaining 5% for validation

    print("Training set length:", len(train_data))
    print("Validation set length:", len(val_data))
    print("Test set length:", len(test_data))
    print(50*"-")

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    print(50*"-")

    if alpaca52k:
        allowed_max_length = 512
    else:
        allowed_max_length = 1024

    if mask_instructions and phi3_prompt:
        raise ValueError("Simultaneous support for instruction masking and the Phi-3 prompt template has not been implemented yet.")

    if mask_instructions:
        customized_collate_fn = partial(custom_collate_with_masking_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDatasetWithMasking
    elif phi3_prompt:
        customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDatasetPhi
    else:
        customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDataset

    num_workers = 0

    if alpaca52k:
        batch_size = 4
    else:
        batch_size = 8

    torch.manual_seed(123)

    train_dataset = CustomDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = CustomDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )
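
    # A quick sanity check of the data pipeline (a sketch, not executed here):
    #   inputs, targets = next(iter(train_loader))
    #   print(inputs.shape, targets.shape)  # both (batch_size, seq_len), seq_len <= allowed_max_length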

    #######################################
    # Load pretrained model
    #######################################
    BASE_CONFIG = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "drop_rate": 0.0,        # Dropout rate
        "qkv_bias": True         # Query-key-value bias
    }

    model_configs = {
        "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    CHOOSE_MODEL = "gpt2-medium (355M)"

    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

    model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")  # e.g. "gpt2-medium (355M)" -> "355M"
    settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

    model = GPTModel(BASE_CONFIG)
    load_weights_into_gpt(model, params)
    model.eval()
    model.to(device)
    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")

    #######################################
    # Finetuning the model
    #######################################
    print("Initial losses")
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

    print("   Training loss:", train_loss)
    print("   Validation loss:", val_loss)

    start_time = time.time()

    num_epochs = 2
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

    torch.manual_seed(123)

    start_context = format_input_phi(val_data[0]) if phi3_prompt else format_input(val_data[0])

    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=start_context, tokenizer=tokenizer
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
    plot_name = "loss-plot.pdf"
    if mask_instructions:
        plot_name = plot_name.replace(".pdf", "-mask-instructions.pdf")
    if alpaca52k:
        plot_name = plot_name.replace(".pdf", "-alpaca52k.pdf")
    if phi3_prompt:
        plot_name = plot_name.replace(".pdf", "-phi3-prompt.pdf")
    if not any([mask_instructions, alpaca52k, phi3_prompt]):
        plot_name = plot_name.replace(".pdf", "-baseline.pdf")
    plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, plot_name)
    print(50*"-")

    #######################################
    # Saving results
    #######################################
    print("Generating responses")
    for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
        input_text = format_input_phi(entry) if phi3_prompt else format_input(entry)

        token_ids = generate(
            model=model,
            idx=text_to_token_ids(input_text, tokenizer).to(device),
            max_new_tokens=256,
            context_size=BASE_CONFIG["context_length"],
            eos_id=50256
        )
        generated_text = token_ids_to_text(token_ids, tokenizer)

        if phi3_prompt:
            response_text = generated_text[len(input_text):].replace("<|assistant|>:", "").strip()
        else:
            response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

        test_data[i]["model_response"] = response_text

    test_data_path = "instruction-data-with-response.json"
    file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL)}-sft.pth"  # e.g. "gpt2-medium355M-sft.pth"

    if mask_instructions:
        test_data_path = test_data_path.replace(".json", "-mask-instructions.json")
        file_name = file_name.replace(".pth", "-mask-instructions.pth")
    if alpaca52k:
        test_data_path = test_data_path.replace(".json", "-alpaca52k.json")
        file_name = file_name.replace(".pth", "-alpaca52k.pth")
    if phi3_prompt:
        test_data_path = test_data_path.replace(".json", "-phi3-prompt.json")
        file_name = file_name.replace(".pth", "-phi3-prompt.pth")
    if not any([mask_instructions, alpaca52k, phi3_prompt]):
        test_data_path = test_data_path.replace(".json", "-baseline.json")
        file_name = file_name.replace(".pth", "-baseline.pth")

    with open(test_data_path, "w") as file:
        json.dump(test_data, file, indent=4)  # "indent" for pretty-printing
    print(f"Responses saved as {test_data_path}")

    torch.save(model.state_dict(), file_name)
    print(f"Model saved as {file_name}")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Instruction finetune a GPT model"
    )
    options = {"baseline", "mask_instructions", "alpaca_52k", "phi3_prompt"}
    parser.add_argument(
        "--exercise_solution",
        type=str,
        default="baseline",
        help=(
            f"Which experiment to run. Options: {options}."
        )
    )
    args = parser.parse_args()

    if args.exercise_solution == "baseline":
        main()
    elif args.exercise_solution == "mask_instructions":
        main(mask_instructions=True)
    elif args.exercise_solution == "alpaca_52k":
        main(alpaca52k=True)
    elif args.exercise_solution == "phi3_prompt":
        main(phi3_prompt=True)
    else:
        raise ValueError(f"{args.exercise_solution} is not a valid --exercise_solution option. Options: {options}")
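
# Example invocation:
#   python exercise_experiments.py --exercise_solution mask_instructions
# Omitting --exercise_solution runs the baseline experiment.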