# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import json
import os
import urllib.request

import psutil
import torch
from torch.utils.data import Dataset
from tqdm import tqdm


def download_and_load_file(file_path, url):

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # The book originally contained this unnecessary "else" clause:
    # else:
    #     with open(file_path, "r", encoding="utf-8") as file:
    #         text_data = file.read()

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data
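

# A minimal usage sketch; the file name and URL below are illustrative
# placeholders, not values defined anywhere in this module:
#
# data = download_and_load_file(
#     "instruction-data.json",                     # hypothetical local path
#     "https://example.com/instruction-data.json"  # hypothetical URL
# )
# print("Number of entries:", len(data))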


def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text
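

# Example of the Alpaca-style prompt this produces, for a made-up entry
# (output shown below, with the first line wrapped to fit this comment):
#
# entry = {"instruction": "Convert the text to uppercase.", "input": "hello"}
# print(format_input(entry))
#
# Below is an instruction that describes a task. Write a response that
# appropriately completes the request.
#
# ### Instruction:
# Convert the text to uppercase.
#
# ### Input:
# hello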


class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)
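

# A usage sketch assuming the tiktoken package (not imported above) and a
# list of entries with "instruction", "input", and "output" fields:
#
# import tiktoken
# tokenizer = tiktoken.get_encoding("gpt2")
# dataset = InstructionDataset(data, tokenizer)
# print(len(dataset), dataset[0][:5])  # entry count and first five token IDs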


def custom_collate_draft_1(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    # Find the longest sequence in the batch
    # and increase the max length by +1, which will add one extra
    # padding token below
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs
    inputs_lst = []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        # Via padded[:-1], we remove the extra padding token
        # that has been added via the +1 setting in batch_max_length
        # (the extra padding token will become relevant in later code)
        inputs = torch.tensor(padded[:-1])
        inputs_lst.append(inputs)

    # Convert list of inputs to tensor and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    return inputs_tensor
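

# A small illustration with made-up token ID lists in place of real tokenized
# text; every sequence is padded to the length of the longest one:
#
# batch = [[1, 2, 3], [4, 5], [6]]
# print(custom_collate_draft_1(batch))
# tensor([[    1,     2,     3],
#         [    4,     5, 50256],
#         [    6, 50256, 50256]])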


def custom_collate_draft_2(
    batch,
    pad_token_id=50256,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets
        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert lists of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor
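

# With the same made-up batch as above, the targets are the inputs shifted
# one position to the right:
#
# inputs, targets = custom_collate_draft_2([[1, 2, 3], [4, 5], [6]])
# inputs[1]:  tensor([    4,     5, 50256])
# targets[1]: tensor([    5, 50256, 50256])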


def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to batch_max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding token in targets with ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to the maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert lists of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor
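

# A sketch of how this collate function is typically wired into a PyTorch
# DataLoader; the device setting, allowed_max_length, batch size, and
# train_dataset are example values, not defined in this module:
#
# from functools import partial
# from torch.utils.data import DataLoader
#
# customized_collate_fn = partial(
#     custom_collate_fn, device="cpu", allowed_max_length=1024
# )
# train_loader = DataLoader(
#     train_dataset, batch_size=8,
#     collate_fn=customized_collate_fn, shuffle=True
# )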


def check_if_running(process_name):
    running = False
    for proc in psutil.process_iter(["name"]):
        # proc.info["name"] can be None if the name could not be retrieved
        # (e.g., access-denied processes), so guard before the "in" check
        if proc.info["name"] and process_name in proc.info["name"]:
            running = True
            break
    return running
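

# For example, to verify that an Ollama server process is running before
# sending any requests to it:
#
# if not check_if_running("ollama"):
#     raise RuntimeError("Ollama not running. Launch ollama before proceeding.")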


def query_model(
    prompt,
    model="llama3",
    url="http://localhost:11434/api/chat"
):
    # Create the data payload as a dictionary
    data = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "options": {  # Settings below are required for deterministic responses
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
    }

    # Convert the dictionary to a JSON formatted string and encode it to bytes
    payload = json.dumps(data).encode("utf-8")

    # Create a request object, setting the method to POST and adding necessary headers
    request = urllib.request.Request(
        url,
        data=payload,
        method="POST"
    )
    request.add_header("Content-Type", "application/json")

    # Send the request and capture the response
    response_data = ""
    with urllib.request.urlopen(request) as response:
        # Read and decode the streamed response line by line
        while True:
            line = response.readline().decode("utf-8")
            if not line:
                break
            response_json = json.loads(line)
            response_data += response_json["message"]["content"]

    return response_data
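

# A quick sanity check, assuming an Ollama server with the llama3 model is
# listening on the default local port:
#
# print(query_model("What do llamas eat?"))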


def generate_model_scores(json_data, json_key, model="llama3"):
    scores = []
    for entry in tqdm(json_data, desc="Scoring entries"):
        prompt = (
            f"Given the input `{format_input(entry)}` "
            f"and correct output `{entry['output']}`, "
            f"score the model response `{entry[json_key]}`"
            f" on a scale from 0 to 100, where 100 is the best score. "
            f"Respond with the integer number only."
        )
        score = query_model(prompt, model)
        try:
            scores.append(int(score))
        except ValueError:
            print(f"Could not convert score: {score}")
            continue

    return scores
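

# A usage sketch, assuming each entry in test_data stores the fine-tuned
# model's answer under a hypothetical "model_response" key:
#
# scores = generate_model_scores(test_data, "model_response")
# print(f"Average score: {sum(scores) / len(scores):.2f}")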