@@ -785,9 +785,6 @@
"metadata": {},
"outputs": [],
"source": [
- "preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n",
- "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
- "\n",
"all_tokens = sorted(list(set(preprocessed)))\n",
"all_tokens.extend([\"<|endoftext|>\", \"<|unk|>\"])\n",
"\n",
@@ -803,7 +800,7 @@
{
"data": {
"text/plain": [
- "1161"
+ "1132"
]
},
"execution_count": 19,
@@ -825,11 +822,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "('younger', 1156)\n",
- "('your', 1157)\n",
- "('yourself', 1158)\n",
- "('<|endoftext|>', 1159)\n",
- "('<|unk|>', 1160)\n"
+ "('younger', 1127)\n",
+ "('your', 1128)\n",
+ "('yourself', 1129)\n",
+ "('<|endoftext|>', 1130)\n",
+ "('<|unk|>', 1131)\n"
}
],
@@ -918,22 +915,7 @@
- "[1160,\n",
- " 5,\n",
- " 362,\n",
- " 1155,\n",
- " 642,\n",
- " 1000,\n",
- " 10,\n",
- " 1159,\n",
- " 57,\n",
- " 1013,\n",
- " 981,\n",
- " 1009,\n",
- " 738,\n",
- " 1160,\n",
- " 7]"
+ "[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]"
"execution_count": 23,