1 рік тому · 7e78b52a30
--- a/ch02/01_main-chapter-code/ch02.ipynb
+++ b/ch02/01_main-chapter-code/ch02.ipynb
@@ -785,9 +785,6 @@
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
 
				-    "preprocessed = re.split(r'([,.?_!\"()\\']|--|\\s)', raw_text)\n",
			
 
				-    "preprocessed = [item.strip() for item in preprocessed if item.strip()]\n",
			
 
				-    "\n",
			
 
				     "all_tokens = sorted(list(set(preprocessed)))\n",
			
 
				     "all_tokens.extend([\"<|endoftext|>\", \"<|unk|>\"])\n",
			
 
				     "\n",
			
@@ -803,7 +800,7 @@
 
				     {
			
 
				      "data": {
			
 
				       "text/plain": [
			
 
				-       "1161"
			
 
				+       "1132"
			
 
				       ]
			
 
				      },
			
 
				      "execution_count": 19,
			
@@ -825,11 +822,11 @@
 
				      "name": "stdout",
			
 
				      "output_type": "stream",
			
 
				      "text": [
			
 
				-      "('younger', 1156)\n",
			
 
				-      "('your', 1157)\n",
			
 
				-      "('yourself', 1158)\n",
			
 
				-      "('<|endoftext|>', 1159)\n",
			
 
				-      "('<|unk|>', 1160)\n"
			
 
				+      "('younger', 1127)\n",
			
 
				+      "('your', 1128)\n",
			
 
				+      "('yourself', 1129)\n",
			
 
				+      "('<|endoftext|>', 1130)\n",
			
 
				+      "('<|unk|>', 1131)\n"
			
 
				      ]
			
 
				     }
			
 
				    ],
			
@@ -918,22 +915,7 @@
 
				     {
			
 
				      "data": {
			
 
				       "text/plain": [
			
 
				-       "[1160,\n",
			
 
				-       " 5,\n",
			
 
				-       " 362,\n",
			
 
				-       " 1155,\n",
			
 
				-       " 642,\n",
			
 
				-       " 1000,\n",
			
 
				-       " 10,\n",
			
 
				-       " 1159,\n",
			
 
				-       " 57,\n",
			
 
				-       " 1013,\n",
			
 
				-       " 981,\n",
			
 
				-       " 1009,\n",
			
 
				-       " 738,\n",
			
 
				-       " 1013,\n",
			
 
				-       " 1160,\n",
			
 
				-       " 7]"
			
 
				+       "[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]"
			
 
				       ]
			
 
				      },
			
 
				      "execution_count": 23,