# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch
#
# Code to run the exercises; see exercise-solutions.ipynb for more information

from functools import partial
from importlib.metadata import version
import json
import math
import os
import re
import time
import urllib.request

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Import from local files in this folder
from gpt_download import download_and_load_gpt2
from previous_chapters import (
    calc_loss_loader,
    generate,
    GPTModel,
    load_weights_into_gpt,
    text_to_token_ids,
    train_model_simple,
    token_ids_to_text
)


class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)


class InstructionDatasetWithMasking(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # New: Separate list for instruction lengths
        self.instruction_lengths = []
        self.encoded_texts = []

        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )
            # New: collect instruction lengths
            instruction_length = len(tokenizer.encode(instruction_plus_input))
            self.instruction_lengths.append(instruction_length)

    def __getitem__(self, index):
        # New: return both instruction lengths and texts separately
        return self.instruction_lengths[index], self.encoded_texts[index]

    def __len__(self):
        return len(self.data)


class InstructionDatasetPhi(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            ###################################################################
            # NEW: Use `format_input_phi` and adjust the response text template
            instruction_plus_input = format_input_phi(entry)
            response_text = f"\n<|assistant|>:\n{entry['output']}"
            ###################################################################
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)


class LinearWithLoRA(torch.nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        return self.linear(x) + self.lora(x)


class LoRALayer(torch.nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.A = torch.nn.Parameter(torch.empty(in_dim, rank))
        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))  # similar to standard weight initialization
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        x = self.alpha * (x @ self.A @ self.B)
        return x
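
# Note on `LoRALayer`: it computes alpha * (x @ A @ B), with A (in_dim x rank)
# and B (rank x out_dim) as the only trainable matrices. Because B starts as
# all zeros, the LoRA branch contributes nothing at first, so a freshly wrapped
# model behaves exactly like the frozen pretrained one. (Some LoRA
# implementations scale by alpha / rank instead of plain alpha; this code uses
# the plain-alpha convention.)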


def replace_linear_with_lora(model, rank, alpha):
    for name, module in model.named_children():
        if isinstance(module, torch.nn.Linear):
            # Replace the Linear layer with LinearWithLoRA
            setattr(model, name, LinearWithLoRA(module, rank, alpha))
        else:
            # Recursively apply the same function to child modules
            replace_linear_with_lora(module, rank, alpha)
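
# Minimal usage sketch (this mirrors the `lora` branch in `main()` below):
# freeze all pretrained weights first, then swap every torch.nn.Linear for
# LinearWithLoRA so that only the low-rank A and B matrices receive gradients:
#
#   for param in model.parameters():
#       param.requires_grad = False
#   replace_linear_with_lora(model, rank=16, alpha=16)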


def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
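
# Worked example for `custom_collate_fn` (hypothetical two-item batch, default
# pad_token_id=50256 and ignore_index=-100):
#
#   batch = [[1, 2, 3], [4, 5]]  ->  batch_max_length = 4
#   item [1, 2, 3]:  inputs [1, 2, 3],       targets [2, 3, 50256]
#   item [4, 5]:     inputs [4, 5, 50256],   targets [5, 50256, -100]
#
# The first pad token in each target row is kept so the model learns to emit
# <|endoftext|>; every subsequent pad position becomes -100, which
# torch.nn.functional.cross_entropy ignores by default.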


def custom_collate_with_masking_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for instruction_length, item in batch)  # New: each batch element is an (instruction_length, item) tuple

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for instruction_length, item in batch:  # New: unpack the (instruction_length, item) tuples
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Mask all input and instruction tokens in the targets
        targets[:instruction_length-1] = ignore_index

        # Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
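
# Continuing the example above for `custom_collate_with_masking_fn`: if the
# item [4, 5] arrived with instruction_length=2, the extra masking step sets
# targets[:1] = ignore_index, turning its targets into [-100, 50256, -100],
# so no loss is computed on the instruction/input tokens themselves.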


def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data


def format_input_phi(entry):
    instruction_text = (
        f"<|user|>\n{entry['instruction']}"
    )

    input_text = f"\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text
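
# For a hypothetical entry {"instruction": "Rewrite the sentence.",
# "input": "He go home."}, `format_input_phi` returns:
#
#   <|user|>
#   Rewrite the sentence.
#   He go home.
#
# (`InstructionDatasetPhi` then appends "\n<|assistant|>:\n" plus the output.)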


def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text
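
# The same hypothetical entry formatted with the Alpaca-style `format_input`:
#
#   Below is an instruction that describes a task. Write a response that appropriately completes the request.
#
#   ### Instruction:
#   Rewrite the sentence.
#
#   ### Input:
#   He go home.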


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses, plot_name):
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Plot training and validation loss against epochs
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # only show integer labels on x-axis

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # Adjust layout to make room
    print(f"Plot saved as {plot_name}")
    plt.savefig(plot_name)
    # plt.show()


def main(mask_instructions=False, alpaca52k=False, phi3_prompt=False, lora=False):
    #######################################
    # Print package versions
    #######################################
    print()
    pkgs = [
        "matplotlib",  # Plotting library
        "tiktoken",    # Tokenizer
        "torch",       # Deep learning library
        "tqdm",        # Progress bar
        "tensorflow",  # For OpenAI's pretrained weights
    ]
    for p in pkgs:
        print(f"{p} version: {version(p)}")
    print(50*"-")
    #######################################
    # Download and prepare dataset
    #######################################
    file_path = "instruction-data.json"

    if alpaca52k:
        url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
    else:
        url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
    data = download_and_load_file(file_path, url)

    train_portion = int(len(data) * 0.85)  # 85% for training
    test_portion = int(len(data) * 0.1)    # 10% for testing

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]  # Remaining 5% for validation

    print("Training set length:", len(train_data))
    print("Validation set length:", len(val_data))
    print("Test set length:", len(test_data))
    print(50*"-")
    tokenizer = tiktoken.get_encoding("gpt2")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)
    print(50*"-")

    if alpaca52k:
        allowed_max_length = 512
    else:
        allowed_max_length = 1024

    if mask_instructions and phi3_prompt:
        raise ValueError("Simultaneous support for instruction masking and the Phi-3 prompt template has not been implemented yet.")

    if mask_instructions:
        customized_collate_fn = partial(custom_collate_with_masking_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDatasetWithMasking
    elif phi3_prompt:
        customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDatasetPhi
    else:
        customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=allowed_max_length)
        CustomDataset = InstructionDataset
    num_workers = 0

    if alpaca52k:
        batch_size = 4
    else:
        batch_size = 8

    torch.manual_seed(123)

    train_dataset = CustomDataset(train_data, tokenizer)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=True,
        drop_last=True,
        num_workers=num_workers
    )

    val_dataset = CustomDataset(val_data, tokenizer)
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=customized_collate_fn,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )
    #######################################
    # Load pretrained model
    #######################################
    BASE_CONFIG = {
        "vocab_size": 50257,     # Vocabulary size
        "context_length": 1024,  # Context length
        "drop_rate": 0.0,        # Dropout rate
        "qkv_bias": True         # Query-key-value bias
    }

    model_configs = {
        "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
        "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    CHOOSE_MODEL = "gpt2-medium (355M)"

    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

    model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
    settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

    model = GPTModel(BASE_CONFIG)
    load_weights_into_gpt(model, params)
    model.eval()
    model.to(device)
    print("Loaded model:", CHOOSE_MODEL)
    print(50*"-")
    if lora:
        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total trainable parameters before: {total_params:,}")

        for param in model.parameters():
            param.requires_grad = False

        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total trainable parameters after: {total_params:,}")

        # The LoRA A/B matrices are created after the freeze above and default
        # to requires_grad=True, so they are the only parameters that will be
        # updated during finetuning
        replace_linear_with_lora(model, rank=16, alpha=16)

        total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print(f"Total trainable LoRA parameters: {total_params:,}")
        model.to(device)
    #######################################
    # Finetuning the model
    #######################################
    print("Initial losses")
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

    print("   Training loss:", train_loss)
    print("   Validation loss:", val_loss)

    start_time = time.time()
    num_epochs = 2

    optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)

    torch.manual_seed(123)

    start_context = format_input_phi(val_data[0]) if phi3_prompt else format_input(val_data[0])

    train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context=start_context, tokenizer=tokenizer
    )

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))

    plot_name = "loss-plot.pdf"
    if mask_instructions:
        plot_name = plot_name.replace(".pdf", "-mask-instructions.pdf")
    if alpaca52k:
        plot_name = plot_name.replace(".pdf", "-alpaca52k.pdf")
    if phi3_prompt:
        plot_name = plot_name.replace(".pdf", "-phi3-prompt.pdf")
    if lora:
        plot_name = plot_name.replace(".pdf", "-lora.pdf")
    if not any([mask_instructions, alpaca52k, phi3_prompt, lora]):
        plot_name = plot_name.replace(".pdf", "-baseline.pdf")

    plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, plot_name)
    print(50*"-")
    #######################################
    # Saving results
    #######################################
    print("Generating responses")
    for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
        input_text = format_input_phi(entry) if phi3_prompt else format_input(entry)

        token_ids = generate(
            model=model,
            idx=text_to_token_ids(input_text, tokenizer).to(device),
            max_new_tokens=256,
            context_size=BASE_CONFIG["context_length"],
            eos_id=50256
        )
        generated_text = token_ids_to_text(token_ids, tokenizer)

        if phi3_prompt:
            response_text = generated_text[len(input_text):].replace("<|assistant|>:", "").strip()
        else:
            response_text = generated_text[len(input_text):].replace("### Response:", "").strip()

        test_data[i]["model_response"] = response_text

    test_data_path = "instruction-data-with-response.json"
    file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL)}-sft.pth"

    if mask_instructions:
        test_data_path = test_data_path.replace(".json", "-mask-instructions.json")
        file_name = file_name.replace(".pth", "-mask-instructions.pth")
    if alpaca52k:
        test_data_path = test_data_path.replace(".json", "-alpaca52k.json")
        file_name = file_name.replace(".pth", "-alpaca52k.pth")
    if phi3_prompt:
        test_data_path = test_data_path.replace(".json", "-phi3-prompt.json")
        file_name = file_name.replace(".pth", "-phi3-prompt.pth")
    if lora:
        test_data_path = test_data_path.replace(".json", "-lora.json")
        file_name = file_name.replace(".pth", "-lora.pth")
    if not any([mask_instructions, alpaca52k, phi3_prompt, lora]):
        test_data_path = test_data_path.replace(".json", "-baseline.json")
        file_name = file_name.replace(".pth", "-baseline.pth")

    with open(test_data_path, "w") as file:
        json.dump(test_data, file, indent=4)  # "indent" for pretty-printing
    print(f"Responses saved as {test_data_path}")

    torch.save(model.state_dict(), file_name)
    print(f"Model saved as {file_name}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Instruction finetune a GPT model"
    )
    options = {"baseline", "mask_instructions", "alpaca_52k", "phi3_prompt", "lora"}
    parser.add_argument(
        "--exercise_solution",
        type=str,
        default="baseline",
        help=(
            f"Which experiment to run. Options: {options}."
        )
    )
    args = parser.parse_args()

    if args.exercise_solution == "baseline":
        main()
    elif args.exercise_solution == "mask_instructions":
        main(mask_instructions=True)
    elif args.exercise_solution == "alpaca_52k":
        main(alpaca52k=True)
    elif args.exercise_solution == "phi3_prompt":
        main(phi3_prompt=True)
    elif args.exercise_solution == "lora":
        main(lora=True)
    else:
        raise ValueError(f"{args.exercise_solution} is not a valid --exercise_solution option. Options: {options}")
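
# Example invocations (run from this folder):
#   python exercise_experiments.py --exercise_solution baseline
#   python exercise_experiments.py --exercise_solution lora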