|
@@ -248,7 +248,7 @@
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
|
- "execution_count": 10,
|
|
|
|
|
|
|
+ "execution_count": 11,
|
|
|
"id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9",
|
|
"id": "4d50af16-937b-49e0-8ffd-42d30cbb41c9",
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"outputs": [],
|
|
"outputs": [],
|
|
@@ -260,12 +260,11 @@
|
|
|
"\n",
|
|
"\n",
|
|
|
"class GPTDatasetV1(Dataset):\n",
|
|
"class GPTDatasetV1(Dataset):\n",
|
|
|
" def __init__(self, txt, tokenizer, max_length, stride):\n",
|
|
" def __init__(self, txt, tokenizer, max_length, stride):\n",
|
|
|
- " self.tokenizer = tokenizer\n",
|
|
|
|
|
" self.input_ids = []\n",
|
|
" self.input_ids = []\n",
|
|
|
" self.target_ids = []\n",
|
|
" self.target_ids = []\n",
|
|
|
"\n",
|
|
"\n",
|
|
|
" # Tokenize the entire text\n",
|
|
" # Tokenize the entire text\n",
|
|
|
- " token_ids = self.tokenizer.encode(txt)\n",
|
|
|
|
|
|
|
+ " token_ids = tokenizer.encode(txt)\n",
|
|
|
"\n",
|
|
"\n",
|
|
|
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
|
" # Use a sliding window to chunk the book into overlapping sequences of max_length\n",
|
|
|
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
|
" for i in range(0, len(token_ids) - max_length, stride):\n",
|
|
@@ -311,7 +310,7 @@
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
|
- "execution_count": 11,
|
|
|
|
|
|
|
+ "execution_count": 12,
|
|
|
"id": "0128eefa-d7c8-4f76-9851-566dfa7c3745",
|
|
"id": "0128eefa-d7c8-4f76-9851-566dfa7c3745",
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"outputs": [
|
|
"outputs": [
|
|
@@ -324,7 +323,7 @@
|
|
|
" [ 402, 271]])"
|
|
" [ 402, 271]])"
|
|
|
]
|
|
]
|
|
|
},
|
|
},
|
|
|
- "execution_count": 11,
|
|
|
|
|
|
|
+ "execution_count": 12,
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"output_type": "execute_result"
|
|
"output_type": "execute_result"
|
|
|
}
|
|
}
|
|
@@ -341,7 +340,7 @@
|
|
|
},
|
|
},
|
|
|
{
|
|
{
|
|
|
"cell_type": "code",
|
|
"cell_type": "code",
|
|
|
- "execution_count": 12,
|
|
|
|
|
|
|
+ "execution_count": 13,
|
|
|
"id": "ff5c1e90-c6de-4a87-adf6-7e19f603291c",
|
|
"id": "ff5c1e90-c6de-4a87-adf6-7e19f603291c",
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"outputs": [
|
|
"outputs": [
|
|
@@ -354,7 +353,7 @@
|
|
|
" [ 402, 271, 10899, 2138, 257, 7026, 15632, 438]])"
|
|
" [ 402, 271, 10899, 2138, 257, 7026, 15632, 438]])"
|
|
|
]
|
|
]
|
|
|
},
|
|
},
|
|
|
- "execution_count": 12,
|
|
|
|
|
|
|
+ "execution_count": 13,
|
|
|
"metadata": {},
|
|
"metadata": {},
|
|
|
"output_type": "execute_result"
|
|
"output_type": "execute_result"
|
|
|
}
|
|
}
|