# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
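

# Illustrative walkthrough (added note, not from the original source): with
# token_ids = [1, 2, 3, 4, 5, 6], max_length=4, and stride=1, the sliding
# window in GPTDatasetV1.__init__ above yields
#     input [1, 2, 3, 4]  ->  target [2, 3, 4, 5]
#     input [2, 3, 4, 5]  ->  target [3, 4, 5, 6]
# i.e. each target sequence is the input sequence shifted one token to the right.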


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

    return dataloader
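

# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original file).
# It assumes `tiktoken` and `torch` are installed and uses a short in-line
# sample string in place of the book's training text.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_text = (
        "In the heart of the city stood the old library, a relic from a "
        "bygone era. Its stone walls bore the marks of time, and ivy clung "
        "to its facade."
    )

    # Small max_length/stride values keep the demo output readable
    dataloader = create_dataloader_v1(
        sample_text, batch_size=2, max_length=4, stride=4, shuffle=False
    )

    inputs, targets = next(iter(dataloader))
    print("Inputs:\n", inputs)    # shape: (batch_size, max_length)
    print("Targets:\n", targets)  # the same token IDs shifted one position ahead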