# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    return dataloader
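

# Minimal usage sketch (not part of the original file): loads a plain-text file,
# builds the dataloader, and inspects one batch of (input, target) tensor pairs.
# The file name "the-verdict.txt" and the small batch/context sizes below are
# illustrative assumptions; substitute any local text file and settings you like.
if __name__ == "__main__":
    with open("the-verdict.txt", "r", encoding="utf-8") as f:
        raw_text = f.read()

    dataloader = create_dataloader_v1(
        raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
    )

    # Each batch is a pair of tensors of shape (batch_size, max_length);
    # targets are the inputs shifted one token to the right.
    inputs, targets = next(iter(dataloader))
    print("Inputs shape:", inputs.shape)    # expected: torch.Size([8, 4])
    print("Targets shape:", targets.shape)  # expected: torch.Size([8, 4])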