# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import os
import urllib.request
import urllib.error  # imported explicitly for the HTTPError/URLError handlers below
# import requests
import json

import numpy as np
import tensorflow as tf
from tqdm import tqdm


def download_and_load_gpt2(model_size, models_dir):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    backup_base_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/gpt2"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # Build URLs with "/" rather than os.path.join, which would insert
        # backslashes on Windows and break the request
        file_url = f"{base_url}/{model_size}/{filename}"
        backup_url = f"{backup_base_url}/{model_size}/{filename}"
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path, backup_url)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    with open(os.path.join(model_dir, "hparams.json"), "r", encoding="utf-8") as f:
        settings = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params
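
# A minimal usage sketch (not part of the original script; the "124M" size and
# the "gpt2" target directory are illustrative choices): this downloads the
# seven checkpoint files into gpt2/124M and returns the hparams dict plus a
# nested dict of NumPy weight arrays.
#
#     settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
#     print(settings)             # e.g. {"n_vocab": 50257, "n_ctx": 1024, ...}
#     print(params["wte"].shape)  # token-embedding matrix, (n_vocab, n_embd)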


def download_file(url, destination, backup_url=None):
    def _attempt_download(download_url):
        with urllib.request.urlopen(download_url) as response:
            # Get the total file size from headers, defaulting to 0 if not present
            file_size = int(response.headers.get("Content-Length", 0))

            # Check if file exists and has the same size
            if os.path.exists(destination):
                file_size_local = os.path.getsize(destination)
                if file_size == file_size_local:
                    print(f"File already exists and is up-to-date: {destination}")
                    return True  # Indicate success without re-downloading

            block_size = 1024  # 1 kilobyte

            # Initialize the progress bar with total file size
            progress_bar_description = os.path.basename(download_url)
            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
                with open(destination, "wb") as file:
                    while True:
                        chunk = response.read(block_size)
                        if not chunk:
                            break
                        file.write(chunk)
                        progress_bar.update(len(chunk))
            return True

    try:
        if _attempt_download(url):
            return
    except (urllib.error.HTTPError, urllib.error.URLError):
        if backup_url is not None:
            print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
            try:
                if _attempt_download(backup_url):
                    return
            except urllib.error.HTTPError:
                pass

        # If we reach here, both attempts have failed
        error_message = (
            f"Failed to download from both primary URL ({url})"
            f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."
            "\nCheck your internet connection or the file availability.\n"
            "For help, visit: https://github.com/rasbt/LLMs-from-scratch/discussions/273"
        )
        print(error_message)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


# Alternative way using `requests`
"""
def download_file(url, destination):
    # Send a GET request to download the file in streaming mode
    response = requests.get(url, stream=True)

    # Get the total file size from headers, defaulting to 0 if not present
    file_size = int(response.headers.get("content-length", 0))

    # Check if file exists and has the same size
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    # Define the block size for reading the file
    block_size = 1024  # 1 kilobyte

    # Initialize the progress bar with total file size
    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
        # Open the destination file in binary write mode
        with open(destination, "wb") as file:
            # Iterate over the file data in chunks
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))  # Update progress bar
                file.write(chunk)  # Write the chunk to the file
"""


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params
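
# Shape of the returned dictionary (inferred from the loop above): a checkpoint
# variable named "model/h0/attn/c_attn/w" ends up at
#
#     params["blocks"][0]["attn"]["c_attn"]["w"]
#
# while top-level variables such as "model/wte" land directly at params["wte"].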