Răsfoiți Sursa

use attn_scores from sec 3.4 instead of 3.3

rasbt 1 an în urmă
părinte
comite
250e6306e2

+ 44 - 32
ch03/01_main-chapter-code/ch03.ipynb

@@ -1055,16 +1055,23 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tensor([[0.1972, 0.1910, 0.1894, 0.1361, 0.1344, 0.1520],\n",
-      "        [0.1476, 0.2164, 0.2134, 0.1365, 0.1240, 0.1621],\n",
-      "        [0.1479, 0.2157, 0.2129, 0.1366, 0.1260, 0.1608],\n",
-      "        [0.1505, 0.1952, 0.1933, 0.1525, 0.1375, 0.1711],\n",
-      "        [0.1571, 0.1874, 0.1885, 0.1453, 0.1819, 0.1399],\n",
-      "        [0.1473, 0.2033, 0.1996, 0.1500, 0.1160, 0.1839]])\n"
+      "tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],\n",
+      "        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],\n",
+      "        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],\n",
+      "        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],\n",
+      "        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],\n",
+      "        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],\n",
+      "       grad_fn=<SoftmaxBackward0>)\n"
      ]
     }
    ],
    "source": [
+    "# Reuse the query and key weight matrices of the\n",
+    "# SelfAttention_v2 object from the previous section for convenience\n",
+    "queries = sa_v2.W_query(inputs)\n",
+    "keys = sa_v2.W_key(inputs) \n",
+    "attn_scores = queries @ keys.T\n",
+    "\n",
     "attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=1)\n",
     "print(attn_weights)"
    ]
@@ -1120,12 +1127,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tensor([[0.1972, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.1476, 0.2164, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.1479, 0.2157, 0.2129, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.1505, 0.1952, 0.1933, 0.1525, 0.0000, 0.0000],\n",
-      "        [0.1571, 0.1874, 0.1885, 0.1453, 0.1819, 0.0000],\n",
-      "        [0.1473, 0.2033, 0.1996, 0.1500, 0.1160, 0.1839]])\n"
+      "tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.1869, 0.1667, 0.1668, 0.1571, 0.0000, 0.0000],\n",
+      "        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.0000],\n",
+      "        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],\n",
+      "       grad_fn=<MulBackward0>)\n"
      ]
     }
    ],
@@ -1161,11 +1169,12 @@
      "output_type": "stream",
      "text": [
       "tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.4056, 0.5944, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.2566, 0.3741, 0.3693, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.2176, 0.2823, 0.2796, 0.2205, 0.0000, 0.0000],\n",
-      "        [0.1826, 0.2178, 0.2191, 0.1689, 0.2115, 0.0000],\n",
-      "        [0.1473, 0.2033, 0.1996, 0.1500, 0.1160, 0.1839]])\n"
+      "        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],\n",
+      "        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],\n",
+      "        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],\n",
+      "       grad_fn=<DivBackward0>)\n"
      ]
     }
    ],
@@ -1194,12 +1203,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tensor([[0.9995,   -inf,   -inf,   -inf,   -inf,   -inf],\n",
-      "        [0.9544, 1.4950,   -inf,   -inf,   -inf,   -inf],\n",
-      "        [0.9422, 1.4754, 1.4570,   -inf,   -inf,   -inf],\n",
-      "        [0.4753, 0.8434, 0.8296, 0.4937,   -inf,   -inf],\n",
-      "        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654,   -inf],\n",
-      "        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])\n"
+      "tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],\n",
+      "        [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],\n",
+      "        [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],\n",
+      "        [0.2642, 0.1024, 0.1036, 0.0186,   -inf,   -inf],\n",
+      "        [0.2183, 0.0874, 0.0882, 0.0177, 0.0786,   -inf],\n",
+      "        [0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],\n",
+      "       grad_fn=<MaskedFillBackward0>)\n"
      ]
     }
    ],
@@ -1228,11 +1238,12 @@
      "output_type": "stream",
      "text": [
       "tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.4056, 0.5944, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.2566, 0.3741, 0.3693, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.2176, 0.2823, 0.2796, 0.2205, 0.0000, 0.0000],\n",
-      "        [0.1826, 0.2178, 0.2191, 0.1689, 0.2115, 0.0000],\n",
-      "        [0.1473, 0.2033, 0.1996, 0.1500, 0.1160, 0.1839]])\n"
+      "        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.2758, 0.2460, 0.2462, 0.2319, 0.0000, 0.0000],\n",
+      "        [0.2175, 0.1983, 0.1984, 0.1888, 0.1971, 0.0000],\n",
+      "        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],\n",
+      "       grad_fn=<SoftmaxBackward0>)\n"
      ]
     }
    ],
@@ -1318,10 +1329,11 @@
      "text": [
       "tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
       "        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.5132, 0.7482, 0.7386, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.0000, 0.5646, 0.5592, 0.0000, 0.0000, 0.0000],\n",
-      "        [0.0000, 0.4357, 0.0000, 0.3378, 0.0000, 0.0000],\n",
-      "        [0.0000, 0.4065, 0.3991, 0.2999, 0.2320, 0.0000]])\n"
+      "        [0.7599, 0.6194, 0.6206, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.0000, 0.4921, 0.4925, 0.0000, 0.0000, 0.0000],\n",
+      "        [0.0000, 0.3966, 0.0000, 0.3775, 0.0000, 0.0000],\n",
+      "        [0.0000, 0.3327, 0.3331, 0.3084, 0.3331, 0.0000]],\n",
+      "       grad_fn=<MulBackward0>)\n"
      ]
     }
    ],

BIN
ch03/01_main-chapter-code/figures/dropout.png


BIN
ch03/01_main-chapter-code/figures/masked.png


+ 1 - 1
ch04/01_main-chapter-code/ch04.ipynb

@@ -836,7 +836,7 @@
     "        x = self.drop_resid(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
-    "        # Shortcut connection for feed-forward block\n",
+    "        # Shortcut connection for feed forward block\n",
     "        shortcut = x\n",
     "        x = self.norm2(x)\n",
     "        x = self.ff(x)\n",