Selaa lähdekoodia

node/pkg/processor: expire late observations

Fixes https://github.com/certusone/wormhole/issues/685.

Example occurrence this fixes: https://i.imgur.com/gZWKf1n.png

Possible future optimizations include:

- Ignoring late messages in the processor (but we can only ignore
  them post settlement time, so we need the cleanup logic regardless).

- Ignoring late observations from other nodes.

- Using the stored VAA to calculate misses.

- Dropping incomplete local observations. However, this is not trivial
  since we do not know the message ID for those.

commit-id:47e1e59f
Leo 3 vuotta sitten
vanhempi
sitoutus
68bdd4b0b6
1 muutettua tiedostoa jossa 29 lisäystä ja 0 poistoa
  1. 29 0
      node/pkg/processor/cleanup.go

+ 29 - 0
node/pkg/processor/cleanup.go

@@ -4,6 +4,7 @@ import (
 	"context"
 	"encoding/hex"
 	"github.com/certusone/wormhole/node/pkg/common"
+	"github.com/certusone/wormhole/node/pkg/db"
 	"github.com/certusone/wormhole/node/pkg/vaa"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
@@ -23,6 +24,11 @@ var (
 			Name: "wormhole_aggregation_state_expirations_total",
 			Help: "Total number of expired submitted aggregation states",
 		})
+	aggregationStateLate = promauto.NewCounter(
+		prometheus.CounterOpts{
+			Name: "wormhole_aggregation_state_late_total",
+			Help: "Total number of late aggregation states (cluster achieved consensus without us)",
+		})
 	aggregationStateTimeout = promauto.NewCounter(
 		prometheus.CounterOpts{
 			Name: "wormhole_aggregation_state_timeout_total",
@@ -58,6 +64,29 @@ func (p *Processor) handleCleanup(ctx context.Context) {
 		delta := time.Since(s.firstObserved)
 
 		switch {
+		case !s.submitted && s.ourVAA != nil && delta > settlementTime:
+			// Expire pending VAAs post settlement time if we have a stored quorum VAA.
+			//
+			// This occurs when we observed a message after the cluster has already reached
+			// consensus on it, causing us to never achieve quorum.
+
+			if _, err := p.db.GetSignedVAABytes(*db.VaaIDFromVAA(s.ourVAA)); err == nil {
+				// If we have a stored quorum VAA, we can safely expire the state.
+				//
+				// This is a rare case, and we can safely expire the state, since we
+				// have a quorum VAA.
+				p.logger.Info("Expiring late VAA", zap.String("digest", hash), zap.Duration("delta", delta))
+				aggregationStateLate.Inc()
+				delete(p.state.vaaSignatures, hash)
+				break
+			} else if err != db.ErrVAANotFound {
+				p.logger.Error("failed to look up VAA in database",
+					zap.String("digest", hash),
+					zap.Error(err),
+				)
+			}
+
+			fallthrough
 		case !s.settled && delta > settlementTime:
 			// After 30 seconds, the VAA is considered settled - it's unlikely that more observations will
 			// arrive, barring special circumstances. This is a better time to count misses than submission,