Browse Source

bridge: add initial set of basic Prometheus metrics

Leo 4 years ago
parent
commit
14441680d0

+ 1 - 1
Tiltfile

@@ -84,7 +84,7 @@ def build_bridge_yaml():
 k8s_yaml_with_ns(build_bridge_yaml())
 
 k8s_resource("guardian", resource_deps=["proto-gen"], port_forwards=[
-    port_forward(6060, name="Debug Server [:6060]"),
+    port_forward(6060, name="Debug/Status Server [:6060]"),
 ])
 
 # solana agent and cli (runs alongside bridge)

+ 30 - 7
bridge/cmd/guardiand/bridge.go

@@ -4,6 +4,8 @@ import (
 	"context"
 	"fmt"
 	solana_types "github.com/dfuse-io/solana-go"
+	"github.com/gorilla/mux"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"net/http"
 	_ "net/http/pprof"
 	"os"
@@ -42,6 +44,8 @@ var (
 
 	adminSocketPath *string
 
+	statusAddr *string
+
 	bridgeKeyPath       *string
 	solanaBridgeAddress *string
 
@@ -73,6 +77,8 @@ func init() {
 	p2pPort = BridgeCmd.Flags().Uint("port", 8999, "P2P UDP listener port")
 	p2pBootstrap = BridgeCmd.Flags().String("bootstrap", "", "P2P bootstrap peers (comma-separated)")
 
+	statusAddr = BridgeCmd.Flags().String("statusAddr", "[::1]:6060", "Listen address for status server (disabled if blank)")
+
 	nodeKeyPath = BridgeCmd.Flags().String("nodeKey", "", "Path to node key (will be generated if it doesn't exist)")
 
 	adminSocketPath = BridgeCmd.Flags().String("adminSocket", "", "Admin gRPC service UNIX domain socket path")
@@ -189,16 +195,33 @@ func runBridge(cmd *cobra.Command, args []string) {
 		readiness.RegisterComponent(common.ReadinessTerraSyncing)
 	}
 
-	// In devnet mode, we automatically set a number of flags that rely on deterministic keys.
-	if *unsafeDevMode {
-		go func() {
-			// TODO: once monitoring server is implemented, move this to that http server instance
-			http.HandleFunc("/readyz", readiness.Handler)
+	if *statusAddr != "" {
+		// Use a custom routing instead of using http.DefaultServeMux directly to avoid accidentally exposing packages
+		// that register themselves with it by default (like pprof).
+		router := mux.NewRouter()
+
+		// pprof server. NOT necessarily safe to expose publicly - only enable it in dev mode to avoid exposing it by
+		// accident. There's benefit to having pprof enabled on production nodes, but we would likely want to expose it
+		// via a dedicated port listening on localhost, or via the admin UNIX socket.
+		if *unsafeDevMode {
+			// Pass requests to http.DefaultServeMux, which pprof automatically registers with as an import side-effect.
+			router.PathPrefix("/debug/pprof/").Handler(http.DefaultServeMux)
+		}
+
+		// Simple endpoint exposing node readiness (safe to expose to untrusted clients)
+		router.HandleFunc("/readyz", readiness.Handler)
 
-			logger.Info("debug server listening on [::]:6060")
-			logger.Error("debug server crashed", zap.Error(http.ListenAndServe("[::]:6060", nil)))
+		// Prometheus metrics (safe to expose to untrusted clients)
+		router.Handle("/metrics", promhttp.Handler())
+
+		go func() {
+			logger.Info("status server listening", zap.String("addr", *statusAddr))
+			logger.Error("status server crashed", zap.Error(http.ListenAndServe(*statusAddr, router)))
 		}()
+	}
 
+	// In devnet mode, we automatically set a number of flags that rely on deterministic keys.
+	if *unsafeDevMode {
 		g0key, err := peer.IDFromPrivateKey(devnet.DeterministicP2PPrivKeyByIndex(0))
 		if err != nil {
 			panic(err)

+ 2 - 1
bridge/go.mod

@@ -25,6 +25,7 @@ require (
 	github.com/golang/protobuf v1.4.3
 	github.com/google/gopacket v1.1.19 // indirect
 	github.com/google/uuid v1.1.5 // indirect
+	github.com/gorilla/mux v1.7.4
 	github.com/gorilla/websocket v1.4.2
 	github.com/hashicorp/errwrap v1.1.0 // indirect
 	github.com/imdario/mergo v0.3.11 // indirect
@@ -51,7 +52,7 @@ require (
 	github.com/olekukonko/tablewriter v0.0.4 // indirect
 	github.com/pborman/uuid v1.2.1 // indirect
 	github.com/peterh/liner v1.2.1 // indirect
-	github.com/prometheus/procfs v0.0.11 // indirect
+	github.com/prometheus/client_golang v1.9.0
 	github.com/prometheus/tsdb v0.10.0 // indirect
 	github.com/shirou/gopsutil v2.20.9+incompatible // indirect
 	github.com/spf13/cast v1.3.1 // indirect

+ 20 - 2
bridge/go.sum

@@ -82,6 +82,7 @@ github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuy
 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
 github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4 h1:Hs82Z41s6SdL1CELW+XaDYmOH4hkBN4/N9og/AsOv7E=
 github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
+github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho=
 github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8=
 github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
 github.com/allegro/bigcache v1.2.1 h1:hg1sY1raCwic3Vnsvje6TT7/pnZba83LeFck5NrFKSc=
@@ -431,6 +432,7 @@ github.com/gorilla/context v1.1.1/go.mod h1:kBGZzfjB9CEq2AlWe17Uuf7NDRt0dE0s8S51
 github.com/gorilla/handlers v1.4.2/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ=
 github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
 github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs=
+github.com/gorilla/mux v1.7.4 h1:VuZ8uybHlWmqV03+zRzdwKL4tUnIp1MAQtp1mIFE1bc=
 github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So=
 github.com/gorilla/rpc v1.2.0 h1:WvvdC2lNeT1SP32zrIce5l0ECBfbAlmrmSBsuc57wfk=
 github.com/gorilla/rpc v1.2.0/go.mod h1:V4h9r+4sF5HnzqbwIez0fKSpANP0zlYd3qR7p36jkTQ=
@@ -575,6 +577,7 @@ github.com/jmespath/go-jmespath v0.0.0-20180206201540-c2b33e8439af/go.mod h1:Nht
 github.com/jmhodges/levigo v1.0.0 h1:q5EC36kV79HWeTBWsod3mG11EgStG3qArTKcvlksN1U=
 github.com/jmhodges/levigo v1.0.0/go.mod h1:Q6Qx+uH3RAqyK4rFQroq9RL7mdkABMcfhEI+nNuzMJQ=
 github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
+github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
 github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
 github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
 github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
@@ -588,6 +591,7 @@ github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7
 github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU=
 github.com/julienschmidt/httprouter v1.1.1-0.20170430222011-975b5c4c7c21/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
 github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
+github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM=
 github.com/kami-zh/go-capturer v0.0.0-20171211120116-e492ea43421d/go.mod h1:P2viExyCEfeWGU259JnaQ34Inuec4R38JCyBx2edgD0=
 github.com/karalabe/hid v1.0.0/go.mod h1:Vr51f8rUOLYrfrWDFlV12GGQgM5AT8sVh+2fY4MPeu8=
 github.com/karalabe/usb v0.0.0-20190919080040-51dc0efba356 h1:I/yrLt2WilKxlQKCM52clh5rGzTKpVctGT1lH4Dc8Jw=
@@ -609,6 +613,7 @@ github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgo
 github.com/klauspost/reedsolomon v1.9.2/go.mod h1:CwCi+NUr9pqSVktrkN+Ondf06rkhYZ/pcNv7fu+8Un4=
 github.com/klauspost/reedsolomon v1.9.3/go.mod h1:CwCi+NUr9pqSVktrkN+Ondf06rkhYZ/pcNv7fu+8Un4=
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
+github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/koron/go-ssdp v0.0.0-20191105050749-2e1c40ed0b5d h1:68u9r4wEvL3gYg2jvAOgROwZ3H+Y3hIDk4tbbmIjcYQ=
 github.com/koron/go-ssdp v0.0.0-20191105050749-2e1c40ed0b5d/go.mod h1:5Ky9EC2xfoUKUor0Hjgi2BJhCSXJfMOFlmyYrVKGQMk=
 github.com/koron/go-ssdp v0.0.2 h1:fL3wAoyT6hXHQlORyXUW4Q23kkQpJRgEAYcZB5BR71o=
@@ -993,6 +998,7 @@ github.com/multiformats/go-varint v0.0.6 h1:gk85QWKxh3TazbLxED/NlDVv8+q+ReFJk7Y2
 github.com/multiformats/go-varint v0.0.6/go.mod h1:3Ls8CIEsrijN6+B7PbrXRPxHRPuXSrVKRY101jdMZYE=
 github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
 github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
 github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
 github.com/naoina/go-stringutil v0.1.0/go.mod h1:XJ2SJL9jCtBh+P9q5btrd/Ylo8XwT/h1USek5+NqSA0=
 github.com/naoina/toml v0.1.2-0.20170918210437-9fafd6967416/go.mod h1:NBIhNtsFMo3G2szEBne+bO4gS192HuIYRqfvOWb4i1E=
@@ -1090,6 +1096,9 @@ github.com/prometheus/client_golang v1.3.0/go.mod h1:hJaj2vgQTGQmVCsAACORcieXFeD
 github.com/prometheus/client_golang v1.4.1/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU=
 github.com/prometheus/client_golang v1.5.1 h1:bdHYieyGlH+6OLEk2YQha8THib30KP0/yD0YH9m6xcA=
 github.com/prometheus/client_golang v1.5.1/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU=
+github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
+github.com/prometheus/client_golang v1.9.0 h1:Rrch9mh17XcxvEu9D9DEpb4isxjGBtcevQjKvxPRQIU=
+github.com/prometheus/client_golang v1.9.0/go.mod h1:FqZLKOZnGdFAhOK4nqGHa7D66IdsO+O441Eve7ptJDU=
 github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
 github.com/prometheus/client_model v0.0.0-20190115171406-56726106282f/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
 github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
@@ -1106,6 +1115,9 @@ github.com/prometheus/common v0.6.0/go.mod h1:eBmuwkDJBwy6iBfxCBob6t6dR6ENT/y+J+
 github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA=
 github.com/prometheus/common v0.9.1 h1:KOMtN28tlbam3/7ZKEYKHhKoJZYYj3gMH4uc62x7X7U=
 github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4=
+github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo=
+github.com/prometheus/common v0.15.0 h1:4fgOnadei3EZvgRwxJ7RMpG1k1pOZth5Pc13tyspaKM=
+github.com/prometheus/common v0.15.0/go.mod h1:U+gB1OBLb1lF3O42bTCL+FK18tX9Oar16Clt/msog/s=
 github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
 github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
 github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
@@ -1115,8 +1127,9 @@ github.com/prometheus/procfs v0.0.3/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDa
 github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
 github.com/prometheus/procfs v0.0.10 h1:QJQN3jYQhkamO4mhfUWqdDH2asK7ONOI9MTWjyAxNKM=
 github.com/prometheus/procfs v0.0.10/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
-github.com/prometheus/procfs v0.0.11 h1:DhHlBtkHWPYi8O2y31JkK0TF+DGM+51OopZjH/Ia5qI=
-github.com/prometheus/procfs v0.0.11/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
+github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
+github.com/prometheus/procfs v0.2.0 h1:wH4vA7pcjKuZzjF7lM8awk4fnuJO6idemZXoKnULUx4=
+github.com/prometheus/procfs v0.2.0/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU=
 github.com/prometheus/tsdb v0.6.2-0.20190402121629-4f204dcbc150 h1:ZeU+auZj1iNzN8iVhff6M38Mfu73FQiJve/GEXYJBjE=
 github.com/prometheus/tsdb v0.6.2-0.20190402121629-4f204dcbc150/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
 github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
@@ -1176,6 +1189,7 @@ github.com/shurcooL/webdavfs v0.0.0-20170829043945-18c3829fa133/go.mod h1:hKmq5k
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
 github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4=
 github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
+github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
 github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
@@ -1495,6 +1509,7 @@ golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLL
 golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
 golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A=
+golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
 golang.org/x/net v0.0.0-20200822124328-c89045814202 h1:VvcQYSHwXgi7W+TpUR6A9g6Up98WAHf3f/ulnJ62IyA=
@@ -1580,13 +1595,16 @@ golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20u
 golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200519105757-fe76b779f299/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200602225109-6fdc65e7d980/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200622214017-ed371f2e16b4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200814200057-3d37ad5750ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200824131525-c12d262b63d8 h1:AvbQYmiaaaza3cW3QXRyPo5kYgpFIzOAfeAAN7m3qQ4=
 golang.org/x/sys v0.0.0-20200824131525-c12d262b63d8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201013132646-2da7054afaeb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201214210602-f9fddec55a1e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4 h1:myAQVi0cGEoqQVR5POX+8RR2mrocKqNN1hmeMqhX27k=
 golang.org/x/sys v0.0.0-20210119212857-b64e53b001e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/term v0.0.0-20201117132131-f5c789dd3221 h1:/ZHdbVpdR/jk3g30/d4yUL0JU9kksj8+F/bnQUVLGDM=

+ 15 - 0
bridge/pkg/processor/broadcast.go

@@ -2,6 +2,7 @@ package processor
 
 import (
 	"encoding/hex"
+	"github.com/prometheus/client_golang/prometheus"
 	"time"
 
 	ethcommon "github.com/ethereum/go-ethereum/common"
@@ -12,6 +13,18 @@ import (
 	"github.com/certusone/wormhole/bridge/pkg/vaa"
 )
 
+var (
+	observationsBroadcastTotal = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_broadcast_total",
+			Help: "Total number of signed observations queued for broadcast",
+		})
+)
+
+func init() {
+	prometheus.MustRegister(observationsBroadcastTotal)
+}
+
 func (p *Processor) broadcastSignature(v *vaa.VAA, signature []byte) {
 	digest, err := v.SigningMsg()
 	if err != nil {
@@ -48,4 +61,6 @@ func (p *Processor) broadcastSignature(v *vaa.VAA, signature []byte) {
 
 	// Fast path for our own signature
 	go func() { p.obsvC <- &obsv }()
+
+	observationsBroadcastTotal.Inc()
 }

+ 42 - 0
bridge/pkg/processor/cleanup.go

@@ -2,14 +2,52 @@ package processor
 
 import (
 	"context"
+	"github.com/prometheus/client_golang/prometheus"
 	"time"
 
 	"go.uber.org/zap"
 )
 
+var (
+	aggregationStateEntries = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Name: "wormhole_aggregation_state_entries",
+			Help: "Current number of aggregation state entries (including unexpired succeed ones)",
+		})
+	aggregationStateExpiration = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_aggregation_state_expirations_total",
+			Help: "Total number of expired submitted aggregation states",
+		})
+	aggregationStateTimeout = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_aggregation_state_timeout_total",
+			Help: "Total number of aggregation states expired due to timeout after exhausting retries",
+		})
+	aggregationStateRetries = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_aggregation_state_retries_total",
+			Help: "Total number of aggregation states queued for resubmission",
+		})
+	aggregationStateUnobserved = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_aggregation_state_unobserved_total",
+			Help: "Total number of aggregation states expired due to no matching local lockup observations",
+		})
+)
+
+func init() {
+	prometheus.MustRegister(aggregationStateEntries)
+	prometheus.MustRegister(aggregationStateExpiration)
+	prometheus.MustRegister(aggregationStateTimeout)
+	prometheus.MustRegister(aggregationStateRetries)
+	prometheus.MustRegister(aggregationStateUnobserved)
+}
+
 // handleCleanup handles periodic retransmissions and cleanup of VAAs
 func (p *Processor) handleCleanup(ctx context.Context) {
 	p.logger.Info("aggregation state summary", zap.Int("cached", len(p.state.vaaSignatures)))
+	aggregationStateEntries.Set(float64(len(p.state.vaaSignatures)))
 
 	for hash, s := range p.state.vaaSignatures {
 		delta := time.Now().Sub(s.firstObserved)
@@ -22,10 +60,12 @@ func (p *Processor) handleCleanup(ctx context.Context) {
 			// and then expired after a while (as noted in observation.go, this can be abused by a byzantine guardian).
 			p.logger.Info("expiring submitted VAA", zap.String("digest", hash), zap.Duration("delta", delta))
 			delete(p.state.vaaSignatures, hash)
+			aggregationStateExpiration.Inc()
 		case !s.submitted && s.retryCount >= 10:
 			// Clearly, this horse is dead and continued beatings won't bring it closer to quorum.
 			p.logger.Info("expiring unsubmitted VAA after exhausting retries", zap.String("digest", hash), zap.Duration("delta", delta))
 			delete(p.state.vaaSignatures, hash)
+			aggregationStateTimeout.Inc()
 		case !s.submitted && delta.Minutes() >= 5:
 			// Poor VAA has been unsubmitted for five minutes - clearly, something went wrong.
 			// If we have previously submitted an observation, we can make another attempt to get it over
@@ -39,9 +79,11 @@ func (p *Processor) handleCleanup(ctx context.Context) {
 					zap.Int("retry", 1))
 				p.sendC <- s.ourMsg
 				s.retryCount += 1
+				aggregationStateRetries.Inc()
 			} else {
 				p.logger.Info("expiring unsubmitted nil VAA", zap.String("digest", hash), zap.Duration("delta", delta))
 				delete(p.state.vaaSignatures, hash)
+				aggregationStateUnobserved.Inc()
 			}
 		}
 	}

+ 14 - 0
bridge/pkg/processor/injection.go

@@ -3,6 +3,7 @@ package processor
 import (
 	"context"
 	"encoding/hex"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/ethereum/go-ethereum/crypto"
 	"go.uber.org/zap"
@@ -11,6 +12,18 @@ import (
 	"github.com/certusone/wormhole/bridge/pkg/vaa"
 )
 
+var (
+	vaaInjectionsTotal = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_vaa_injections_total",
+			Help: "Total number of injected VAA queued for broadcast",
+		})
+)
+
+func init() {
+	prometheus.MustRegister(vaaInjectionsTotal)
+}
+
 // handleInjection processes a pre-populated VAA injected locally.
 func (p *Processor) handleInjection(ctx context.Context, v *vaa.VAA) {
 	// Check if we're in the guardian set.
@@ -44,5 +57,6 @@ func (p *Processor) handleInjection(ctx context.Context, v *vaa.VAA) {
 		zap.String("signature", hex.EncodeToString(s)),
 		zap.Int("our_index", us))
 
+	vaaInjectionsTotal.Inc()
 	p.broadcastSignature(v, s)
 }

+ 35 - 1
bridge/pkg/processor/lockup.go

@@ -3,6 +3,7 @@ package processor
 import (
 	"context"
 	"encoding/hex"
+	"github.com/prometheus/client_golang/prometheus"
 
 	"github.com/ethereum/go-ethereum/crypto"
 	"go.uber.org/zap"
@@ -12,7 +13,32 @@ import (
 	"github.com/certusone/wormhole/bridge/pkg/vaa"
 )
 
-// handleLockup processes a lockup received from a chain and instantiates our deterministic copy of the VAA
+var (
+	// SECURITY: source_chain/target_chain are untrusted uint8 values. An attacker could cause a maximum of 256**2 label
+	// pairs to be created, which is acceptable.
+
+	lockupsObservedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "wormhole_lockups_observed_total",
+			Help: "Total number of lockups received on-chain",
+		},
+		[]string{"source_chain", "target_chain"})
+
+	lockupsSignedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "wormhole_lockups_signed_total",
+			Help: "Total number of lockups that were successfully signed",
+		},
+		[]string{"source_chain", "target_chain"})
+)
+
+func init() {
+	prometheus.MustRegister(lockupsObservedTotal)
+	prometheus.MustRegister(lockupsSignedTotal)
+}
+
+// handleLockup processes a lockup received from a chain and instantiates our deterministic copy of the VAA. A lockup
+// event may be received multiple times until it has been successfully completed.
 func (p *Processor) handleLockup(ctx context.Context, k *common.ChainLock) {
 	supervisor.Logger(ctx).Info("lockup confirmed",
 		zap.Stringer("source_chain", k.SourceChain),
@@ -26,6 +52,10 @@ func (p *Processor) handleLockup(ctx context.Context, k *common.ChainLock) {
 		zap.Time("timestamp", k.Timestamp),
 	)
 
+	lockupsObservedTotal.With(prometheus.Labels{
+		"source_chain": k.SourceChain.String(),
+		"target_chain": k.TargetChain.String()}).Add(1)
+
 	if p.gs == nil {
 		p.logger.Warn("received observation, but we don't know the guardian set yet")
 		return
@@ -83,5 +113,9 @@ func (p *Processor) handleLockup(ctx context.Context, k *common.ChainLock) {
 		zap.String("signature", hex.EncodeToString(s)),
 		zap.Int("our_index", us))
 
+	lockupsSignedTotal.With(prometheus.Labels{
+		"source_chain": k.SourceChain.String(),
+		"target_chain": k.TargetChain.String()}).Add(1)
+
 	p.broadcastSignature(v, s)
 }

+ 63 - 1
bridge/pkg/processor/observation.go

@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/hex"
 	"fmt"
+	"github.com/prometheus/client_golang/prometheus"
 	"strings"
 	"time"
 
@@ -18,6 +19,48 @@ import (
 	"github.com/certusone/wormhole/bridge/pkg/vaa"
 )
 
+var (
+	observationsReceivedTotal = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_received_total",
+			Help: "Total number of raw VAA observations received from gossip",
+		})
+	observationsReceivedByGuardianAddressTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_signed_by_guardian_total",
+			Help: "Total number of signed and verified VAA observations grouped by guardian address",
+		}, []string{"addr"})
+	observationsFailedTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_verification_failures_total",
+			Help: "Total number of observations verification failure, grouped by failure reason",
+		}, []string{"cause"})
+	observationsUnknownLockupTotal = prometheus.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_unknown_lockup_total",
+			Help: "Total number of verified VAA observations for a lockup we haven't seen yet",
+		})
+	observationsDirectSubmissionsTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_direct_submissions_queued_total",
+			Help: "Total number of observations for a specific target chain that were queued for direct submission",
+		}, []string{"target_chain"})
+	observationsDirectSubmissionSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "wormhole_observations_direct_submission_success_total",
+			Help: "Total number of observations for a specific target chain that succeeded",
+		}, []string{"target_chain"})
+)
+
+func init() {
+	prometheus.MustRegister(observationsReceivedTotal)
+	prometheus.MustRegister(observationsReceivedByGuardianAddressTotal)
+	prometheus.MustRegister(observationsFailedTotal)
+	prometheus.MustRegister(observationsUnknownLockupTotal)
+	prometheus.MustRegister(observationsDirectSubmissionsTotal)
+	prometheus.MustRegister(observationsDirectSubmissionSuccessTotal)
+}
+
 // handleObservation processes a remote VAA observation, verifies it, checks whether the VAA has met quorum,
 // and assembles and submits a valid VAA if possible.
 func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObservation) {
@@ -31,6 +74,8 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 		zap.String("signature", hex.EncodeToString(m.Signature)),
 		zap.String("addr", hex.EncodeToString(m.Addr)))
 
+	observationsReceivedTotal.Inc()
+
 	// Verify the Guardian's signature. This verifies that m.Signature matches m.Hash and recovers
 	// the public key that was used to sign the payload.
 	pk, err := crypto.Ecrecover(m.Hash, m.Signature)
@@ -40,6 +85,7 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 			zap.String("signature", hex.EncodeToString(m.Signature)),
 			zap.String("addr", hex.EncodeToString(m.Addr)),
 			zap.Error(err))
+		observationsFailedTotal.WithLabelValues("invalid_signature").Inc()
 		return
 	}
 
@@ -53,6 +99,7 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 			zap.String("signature", hex.EncodeToString(m.Signature)),
 			zap.String("addr", hex.EncodeToString(m.Addr)),
 			zap.String("pk", signer_pk.Hex()))
+		observationsFailedTotal.WithLabelValues("pubkey_mismatch").Inc()
 		return
 	}
 
@@ -63,6 +110,7 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 			zap.String("their_addr", their_addr.Hex()),
 			zap.Any("current_set", p.gs.KeysAsHexStrings()),
 		)
+		observationsFailedTotal.WithLabelValues("unknown_guardian").Inc()
 		return
 	}
 
@@ -70,11 +118,15 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 	// a valid signature by an active guardian. We still don't fully trust them, as they may be
 	// byzantine, but now we know who we're dealing with.
 
+	// We can now count events by guardian without worrying about cardinality explosions:
+	// TODO: add source_chain
+	observationsReceivedByGuardianAddressTotal.WithLabelValues(their_addr.Hex()).Inc()
+
 	// []byte isn't hashable in a map. Paying a small extra cost for encoding for easier debugging.
 	hash := hex.EncodeToString(m.Hash)
 
 	if p.state.vaaSignatures[hash] == nil {
-		// We haven't yet seen this event ourselves, and therefore  do not know what the VAA looks like.
+		// We haven't yet seen this event ourselves, and therefore do not know what the VAA looks like.
 		// However, we have established that a valid guardian has signed it, and therefore we can
 		// already start aggregating signatures for it.
 		//
@@ -82,6 +134,8 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 		// leading to a slow out-of-memory crash. We do not attempt to automatically mitigate spam attacks with valid
 		// signatures - such byzantine behavior would be plainly visible and would be dealt with by kicking them.
 
+		observationsUnknownLockupTotal.Inc()
+
 		p.state.vaaSignatures[hash] = &vaaState{
 			firstObserved: time.Now(),
 			signatures:    map[common.Address][]byte{},
@@ -204,6 +258,8 @@ func (p *Processor) handleObservation(ctx context.Context, m *gossipv1.SignedObs
 // have an Ethereum account and the user retrieves the VAA and submits the transactions themselves.
 func (p *Processor) devnetVAASubmission(ctx context.Context, signed *vaa.VAA, hash string) {
 	if p.devnetMode {
+		observationsDirectSubmissionsTotal.WithLabelValues("ethereum").Inc()
+
 		timeout, cancel := context.WithTimeout(ctx, 15*time.Second)
 		tx, err := devnet.SubmitVAA(timeout, p.devnetEthRPC, signed)
 		cancel()
@@ -217,6 +273,8 @@ func (p *Processor) devnetVAASubmission(ctx context.Context, signed *vaa.VAA, ha
 			}
 			return
 		}
+
+		observationsDirectSubmissionSuccessTotal.WithLabelValues("ethereum").Inc()
 		p.logger.Info("VAA submitted to Ethereum", zap.Any("tx", tx), zap.String("digest", hash))
 	}
 }
@@ -229,6 +287,8 @@ func (p *Processor) terraVAASubmission(ctx context.Context, signed *vaa.VAA, has
 		return
 	}
 
+	observationsDirectSubmissionsTotal.WithLabelValues("terra").Inc()
+
 	tx, err := terra.SubmitVAA(ctx, p.terraLCD, p.terraChaidID, p.terraContract, p.terraFeePayer, signed)
 	if err != nil {
 		if strings.Contains(err.Error(), "VaaAlreadyExecuted") {
@@ -240,5 +300,7 @@ func (p *Processor) terraVAASubmission(ctx context.Context, signed *vaa.VAA, has
 		}
 		return
 	}
+
+	observationsDirectSubmissionSuccessTotal.WithLabelValues("terra").Inc()
 	p.logger.Info("VAA submitted to Terra", zap.Any("tx", tx), zap.String("digest", hash))
 }

+ 1 - 2
bridge/pkg/solana/watcher.go

@@ -51,8 +51,7 @@ func (e *SolanaVAASubmitter) Run(ctx context.Context) error {
 	errC := make(chan error)
 	logger := supervisor.Logger(ctx)
 
-	// Check whether agent is up by doing a GetBalance call. This is a bit hacky, but otherwise, a broken agent won't
-	// fail until Recv(). Readiness is best-effort and if this succeeds, it's fair to assume that the watch does too.
+	// Check whether agent is up by doing a GetBalance call.
 	balance, err := c.GetBalance(timeout, &agentv1.GetBalanceRequest{})
 	if err != nil {
 		return fmt.Errorf("failed to get balance: %v", err)