# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

from pathlib import Path
import urllib.error

import pandas as pd
import tiktoken
import torch
from torch.utils.data import DataLoader, Subset

from llms_from_scratch.ch04 import GPTModel
from llms_from_scratch.ch06 import (
    download_and_unzip_spam_data,
    create_balanced_dataset,
    random_split,
    SpamDataset,
    train_classifier_simple,
)


def test_train_classifier(tmp_path):
    ########################################
    # Download and prepare dataset
    ########################################
    url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
    zip_path = tmp_path / "sms_spam_collection.zip"
    extracted_path = tmp_path / "sms_spam_collection"
    data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

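    # The UCI archive is occasionally unreachable, so fall back to the book's
    # mirror if the primary download fails with a network error.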
    try:
        download_and_unzip_spam_data(
            url, zip_path, extracted_path, data_file_path
        )
    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
        print(f"Primary URL failed: {e}. Trying backup URL...")
        backup_url = (
            "https://f001.backblazeb2.com/file/LLMs-from-scratch/"
            "sms%2Bspam%2Bcollection.zip"
        )
        download_and_unzip_spam_data(
            backup_url, zip_path, extracted_path, data_file_path
        )

    df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])

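    # create_balanced_dataset (ch06) undersamples the majority "ham" class so
    # both labels are equally represented before they are binarized.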
    balanced_df = create_balanced_dataset(df)
    balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

    train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
    train_df.to_csv(tmp_path / "train.csv", index=None)
    validation_df.to_csv(tmp_path / "validation.csv", index=None)
    test_df.to_csv(tmp_path / "test.csv", index=None)

    ########################################
    # Create data loaders
    ########################################
    tokenizer = tiktoken.get_encoding("gpt2")

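    # With max_length=None, SpamDataset infers the longest encoded message in
    # the training split and pads all shorter messages to that length.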
    train_dataset = SpamDataset(
        csv_file=tmp_path / "train.csv",
        max_length=None,
        tokenizer=tokenizer
    )
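    # Reuse the training split's max_length so validation batches have the
    # same sequence length as the training batches.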
    val_dataset = SpamDataset(
        csv_file=tmp_path / "validation.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )

    num_workers = 0
    batch_size = 8

    torch.manual_seed(123)

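    # drop_last=True discards a final partial batch during training; the
    # validation loader keeps every example.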
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True,
    )
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        drop_last=False,
    )

    ########################################
    # Load pretrained model
    ########################################

    # Small GPT model for testing purposes
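    # (The chapter itself loads pretrained GPT-2 weights; a randomly
    # initialized 1-layer, 12-dimensional model keeps this test fast.)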
    BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 120,
        "drop_rate": 0.0,
        "qkv_bias": False,
        "emb_dim": 12,
        "n_layers": 1,
        "n_heads": 2
    }

    model = GPTModel(BASE_CONFIG)
    model.eval()
    device = "cpu"

    ########################################
    # Modify pretrained model
    ########################################

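    # Freeze all weights first; selected layers are unfrozen further below.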
    for param in model.parameters():
        param.requires_grad = False

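    # Replace the 50,257-token language-model head with a new trainable
    # 2-class classification head.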
    torch.manual_seed(123)

    num_classes = 2
    model.out_head = torch.nn.Linear(
        in_features=BASE_CONFIG["emb_dim"],
        out_features=num_classes,
    )
    model.to(device)

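    # Unfreeze the last transformer block and the final LayerNorm so they can
    # adapt to the classification task alongside the new head.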
    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True
    for param in model.final_norm.parameters():
        param.requires_grad = True

    ########################################
    # Finetune modified model
    ########################################
    torch.manual_seed(123)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

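    # Train on tiny 5-example subsets so the test finishes in seconds instead
    # of exercising the chapter's full finetuning run.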
    train_subset = Subset(train_loader.dataset, range(5))
    batch_train_loader = DataLoader(train_subset, batch_size=5)
    val_subset = Subset(val_loader.dataset, range(5))
    batch_val_loader = DataLoader(val_subset, batch_size=5)

    num_epochs = 5
    train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier_simple(
        model, batch_train_loader, batch_val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=1, eval_iter=1,
    )

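    # With the fixed seeds above, the initial losses are reproducible; the
    # final check only requires that training reduced the loss.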
    assert round(train_losses[0], 1) == 0.8
    assert round(val_losses[0], 1) == 0.8
    assert train_losses[-1] < train_losses[0]
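
# A minimal sketch of how this test might be run on its own (assuming pytest
# and the llms_from_scratch package are installed; the file path below is a
# placeholder):
#
#   pytest path/to/this_file.py -k test_train_classifier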