exercise_experiments.py

# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# Code to run the exercises; see exercise-solutions.ipynb for more information

from functools import partial
from importlib.metadata import version
import json
import math
import os
import re
import time
import urllib.request

import matplotlib.pyplot as plt
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Import from local files in this folder
from gpt_download import download_and_load_gpt2
from previous_chapters import (
    calc_loss_loader,
    generate,
    GPTModel,
    load_weights_into_gpt,
    text_to_token_ids,
    train_model_simple,
    token_ids_to_text
)

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

class InstructionDatasetWithMasking(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # New: Separate list for instruction lengths
        self.instruction_lengths = []
        self.encoded_texts = []

        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )
            # New: collect instruction lengths
            instruction_length = len(tokenizer.encode(instruction_plus_input))
            self.instruction_lengths.append(instruction_length)

    def __getitem__(self, index):
        # New: return both instruction lengths and texts separately
        return self.instruction_lengths[index], self.encoded_texts[index]

    def __len__(self):
        return len(self.data)
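
# Note: the instruction lengths collected by InstructionDatasetWithMasking are
# consumed by custom_collate_with_masking_fn further below, which uses them to
# mask the instruction and input tokens in the targets so that only the
# response tokens contribute to the loss.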

class InstructionDatasetPhi(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            ###################################################################
            # NEW: Use `format_input_phi` and adjust the response text template
            instruction_plus_input = format_input_phi(entry)
            response_text = f"\n<|assistant|>:\n{entry['output']}"
            ###################################################################
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

class LinearWithLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        return self.linear(x) + self.lora(x)

class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))  # similar to standard weight initialization
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        x = self.alpha * (x @ self.A @ self.B)
        return x
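
# Note: in combination, LinearWithLoRA computes linear(x) + alpha * (x @ A @ B),
# i.e., the frozen pretrained weight plus a trainable low-rank update of rank
# `rank`. Because B is initialized to zeros, the wrapped layer initially
# behaves exactly like the original Linear layer.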

def replace_linear_with_lora(model, rank, alpha):
    for name, module in model.named_children():
        if isinstance(module, torch.nn.Linear):
            # Replace the Linear layer with LinearWithLoRA
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            # Recursively apply the same function to child modules
            replace_linear_with_lora(module, rank, alpha)
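
# Illustrative sketch (not part of the training script below): applying the
# LoRA replacement to a toy model and counting the trainable parameters. With
# a single Linear(4, 4) layer and rank=2, only the LoRA matrices remain
# trainable: A has 4*2 = 8 entries and B has 2*4 = 8 entries, 16 in total.
#
#   toy = torch.nn.Sequential(torch.nn.Linear(4, 4))
#   for p in toy.parameters():
#       p.requires_grad = False
#   replace_linear_with_lora(toy, rank=2, alpha=1.0)
#   trainable = sum(p.numel() for p in toy.parameters() if p.requires_grad)
#   # trainable == 16 (only the LoRA A and B matrices)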

def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
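
# Worked example with hypothetical token IDs and the default pad_token_id=50256:
# for batch = [[1, 2, 3], [4, 5]], the longest sequence plus one end-of-text
# token gives batch_max_length = 4, and the collate function returns
#
#   inputs  = [[1, 2, 3],          targets = [[2, 3, 50256],
#              [4, 5, 50256]]                 [5, 50256, -100]]
#
# Only the first padding token per target row is kept; any additional padding
# tokens are replaced by ignore_index (-100) so they are ignored by the loss.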

def custom_collate_with_masking_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for instruction_length, item in batch)  # New: batch is now a tuple

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for instruction_length, item in batch:  # New: batch is now a tuple
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Mask all input and instruction tokens in the targets
        targets[:instruction_length-1] = ignore_index

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
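
# Continuing the example above: if the first sequence [1, 2, 3] came with an
# instruction_length of 2, the masking variant additionally sets
# targets[:instruction_length - 1] to ignore_index, turning [2, 3, 50256] into
# [-100, 3, 50256], so the instruction tokens are excluded from the loss as well.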

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    with open(file_path, "r") as file:
        data = json.load(file)

    return data

def format_input_phi(entry):
    instruction_text = (
        f"<|user|>\n{entry['instruction']}"
    )

    input_text = f"\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text

def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text
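
# For a hypothetical entry such as
#   {"instruction": "Rewrite the sentence in passive voice.",
#    "input": "The chef cooked the meal.",
#    "output": "The meal was cooked by the chef."}
# format_input produces the Alpaca-style prompt
#
#   Below is an instruction that describes a task. Write a response that appropriately completes the request.
#
#   ### Instruction:
#   Rewrite the sentence in passive voice.
#
#   ### Input:
#   The chef cooked the meal.
#
# whereas format_input_phi produces the Phi-3-style prompt
#
#   <|user|>
#   Rewrite the sentence in passive voice.
#   The chef cooked the meal.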

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, plot_name):
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    print(f"Plot saved as {plot_name}")
    plt.savefig(plot_name)
    # plt.show()

def main(mask_instructions=False, alpaca52k=False, phi3_prompt=False, lora=False):
    #######################################
    # Print package versions
    #######################################
    print()
    pkgs = [
        "matplotlib",  # Plotting library
        "tiktoken",    # Tokenizer
        "torch",       # Deep learning library
        "tqdm",        # Progress bar
        "tensorflow",  # For OpenAI's pretrained weights
    ]
    for p in pkgs:
        print(f"{p} version: {version(p)}")
    print(50*"-")
    #######################################
    # Download and prepare dataset
    #######################################
    file_path = "instruction-data.json"

    if alpaca52k:
        url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
    else:
        url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)

    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]

    print("Training set length:", len(train_data))
    print("Validation set length:", len(val_data))
    print("Test set length:", len(test_data))
    print(50*"-")

    tokenizer = tiktoken.get_encoding("gpt2")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    print(50*"-")
    if alpaca52k:
        allowed_max_length = 512
    else:
        allowed_max_length = 1024

    if mask_instructions and phi3_prompt:
        raise ValueError("Simultaneous support for instruction masking and the Phi-3 prompt template has not been implemented yet.")

    if mask_instructions:
        customized_collate_fn = partial(custom_collate_with_masking_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDatasetWithMasking
    elif phi3_prompt:
        customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDatasetPhi
    else:
        customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDataset

    num_workers = 0

    if alpaca52k:
        batch_size = 4
    else:
        batch_size = 8

    torch.manual_seed(123)

    train_dataset = CustomDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = CustomDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )
    #######################################
    # Load pretrained model
    #######################################
    BASE_CONFIG = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "drop_rate": 0.0,        # Dropout rate
        "qkv_bias": True         # Query-key-value bias
    }

    model_configs = {
        "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    CHOOSE_MODEL = "gpt2-medium (355M)"

    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

    model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
    settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

    model = GPTModel(BASE_CONFIG)
    load_weights_into_gpt(model, params)
    model.eval()
    model.to(device)
    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")
    if lora:
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total trainable parameters before: {total_params:,}")

        for param in model.parameters():
            param.requires_grad = False

        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total trainable parameters after: {total_params:,}")
        replace_linear_with_lora(model, rank=16, alpha=16)

        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total trainable LoRA parameters: {total_params:,}")
        model.to(device)
    #######################################
    # Finetuning the model
    #######################################
    print("Initial losses")
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

    print(" Training loss:", train_loss)
    print(" Validation loss:", val_loss)

    start_time = time.time()

    num_epochs = 2
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

    torch.manual_seed(123)

    start_context = format_input_phi(val_data[0]) if phi3_prompt else format_input(val_data[0])

    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=start_context, tokenizer=tokenizer
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))

    plot_name = "loss-plot.pdf"
    if mask_instructions:
        plot_name = plot_name.replace(".pdf", "-mask-instructions.pdf")
    if alpaca52k:
        plot_name = plot_name.replace(".pdf", "-alpaca52k.pdf")
    if phi3_prompt:
        plot_name = plot_name.replace(".pdf", "-phi3-prompt.pdf")
    if lora:
        plot_name = plot_name.replace(".pdf", "-lora.pdf")
    if not any([mask_instructions, alpaca52k, phi3_prompt, lora]):
        plot_name = plot_name.replace(".pdf", "-baseline.pdf")

    plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, plot_name)
    print(50*"-")
    #######################################
    # Saving results
    #######################################
    print("Generating responses")
    for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
        input_text = format_input_phi(entry) if phi3_prompt else format_input(entry)

        token_ids = generate(
            model=model,
            idx=text_to_token_ids(input_text, tokenizer).to(device),
            max_new_tokens=256,
            context_size=BASE_CONFIG["context_length"],
            eos_id=50256
        )
        generated_text = token_ids_to_text(token_ids, tokenizer)

        if phi3_prompt:
            response_text = generated_text[len(input_text):].replace("<|assistant|>:", "").strip()
        else:
            response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

        test_data[i]["model_response"] = response_text

    test_data_path = "instruction-data-with-response.json"
    file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL)}-sft.pth"

    if mask_instructions:
        test_data_path = test_data_path.replace(".json", "-mask-instructions.json")
        file_name = file_name.replace(".pth", "-mask-instructions.pth")
    if alpaca52k:
        test_data_path = test_data_path.replace(".json", "-alpaca52k.json")
        file_name = file_name.replace(".pth", "-alpaca52k.pth")
    if phi3_prompt:
        test_data_path = test_data_path.replace(".json", "-phi3-prompt.json")
        file_name = file_name.replace(".pth", "-phi3-prompt.pth")
    if lora:
        test_data_path = test_data_path.replace(".json", "-lora.json")
        file_name = file_name.replace(".pth", "-lora.pth")
    if not any([mask_instructions, alpaca52k, phi3_prompt, lora]):
        test_data_path = test_data_path.replace(".json", "-baseline.json")
        file_name = file_name.replace(".pth", "-baseline.pth")

    with open(test_data_path, "w") as file:
        json.dump(test_data, file, indent=4)  # "indent" for pretty-printing
    print(f"Responses saved as {test_data_path}")

    torch.save(model.state_dict(), file_name)
    print(f"Model saved as {file_name}")

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Instruction-finetune a GPT model"
    )
    options = {"baseline", "mask_instructions", "alpaca_52k", "phi3_prompt", "lora"}
    parser.add_argument(
        "--exercise_solution",
        type=str,
        default="baseline",
        help=(
            f"Which experiment to run. Options: {options}."
        )
    )
    args = parser.parse_args()

    if args.exercise_solution == "baseline":
        main()
    elif args.exercise_solution == "mask_instructions":
        main(mask_instructions=True)
    elif args.exercise_solution == "alpaca_52k":
        main(alpaca52k=True)
    elif args.exercise_solution == "phi3_prompt":
        main(phi3_prompt=True)
    elif args.exercise_solution == "lora":
        main(lora=True)
    else:
        raise ValueError(f"{args.exercise_solution} is not a valid --exercise_solution option. Options: {options}")
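
# Example invocations (assuming the other chapter 7 files, such as
# gpt_download.py and previous_chapters.py, are available in the same folder):
#
#   python exercise_experiments.py --exercise_solution baseline
#   python exercise_experiments.py --exercise_solution mask_instructions
#   python exercise_experiments.py --exercise_solution alpaca_52k
#   python exercise_experiments.py --exercise_solution phi3_prompt
#   python exercise_experiments.py --exercise_solution lora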