Backport 5518 - Prevent node from marking itself as malicious (#5521)

* Prevent node from marking itself as malicious (#5518) The ATX handler should not create malfeasance proofs against the node itself. In case we have a bug somewhere that would publish 2 ATXs in a single epoch fail validation, but do not mark yourself as malicious
spacemeshos · Jan 31, 2024 · 2280341 · 2280341
1 parent f3538a4
commit 2280341
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,17 +2,22 @@
 
 See [RELEASE](./RELEASE.md) for workflow instructions.
 
-## UNRELEASED
+## Release v1.3.8
 
 ### Improvements
 
 * [#5441](https://github.com/spacemeshos/go-spacemesh/pull/5441)
   Fix possible nil-pointer dereference in blocks.Generator.
+
 * [#5512](https://github.com/spacemeshos/go-spacemesh/pull/5512)
   Increase EpochActiveSet limit to 1.5M to prepare for 1M+ ATXs.
+
 * [#5515](https://github.com/spacemeshos/go-spacemesh/pull/5515)
   Increase fetcher limit to 60MiB to prepare for 1M+ ATXs.
 
+* [#5518](https://github.com/spacemeshos/go-spacemesh/pull/5518) In rare cases the node could create a malfeasance
+  proof against itself. This is now prevented.
+
 ## Release v1.3.7
 
 ### Improvements
@@ -39,10 +44,14 @@ See [RELEASE](./RELEASE.md) for workflow instructions.
 * [#5498](https://github.com/spacemeshos/go-spacemesh/pull/5498)
   Reduce the default number of CPU cores that are used for verifying incoming ATXs to half of the available cores.
 
-* [#5500](https://github.com/spacemeshos/go-spacemesh/pull/5500)
-  Make fetch request timeout configurable.
-  Add separate metric for failed p2p server requests.
+* [#5462](https://github.com/spacemeshos/go-spacemesh/pull/5462) Add separate metric for failed p2p server requests
+
+* [#5464](https://github.com/spacemeshos/go-spacemesh/pull/5464) Make fetch request timeout configurable.
+
+* [#5463](https://github.com/spacemeshos/go-spacemesh/pull/5463)
   Adjust deadline during long reads and writes, reducing "i/o deadline exceeded" errors.
+
+* [#5494](https://github.com/spacemeshos/go-spacemesh/pull/5494)
   Make routing discovery more configurable and less spammy by default.
 
 ## Release v1.3.5
@@ -208,6 +217,13 @@ for more information on how to configure the node to work with the PoST service.
 * [#5384](https://github.com/spacemeshos/go-spacemesh/pull/5384) to improve network stability and performance allow the
   active set to be set in advance for an epoch. This allows the network to start consensus on the first layer of an epoch.
 
+## Release v1.2.13
+
+### Improvements
+
+* [#5384](https://github.com/spacemeshos/go-spacemesh/pull/5384) to improve network stability and performance allow the
+  active set to be set in advance for an epoch.
+
 ## Release v1.2.12
 
 ### Improvements
@@ -230,6 +246,15 @@ for more information on how to configure the node to work with the PoST service.
 
 * further increased cache sizes and and p2p timeouts to compensate for the increased number of nodes on the network.
 
+* [#5329](https://github.com/spacemeshos/go-spacemesh/pull/5329) P2P decentralization improvements. Added support for QUIC
+  transport and DHT routing discovery for finding peers and relays. Also, added the `ping-peers` feature which is useful
+  during connectivity troubleshooting. `static-relays` feature can be used to provide a static list of circuit v2 relays
+  nodes when automatic relay discovery is not desired. All of the relay server resource settings are now configurable. Most
+  of the new functionality is disabled by default unless explicitly enabled in the config via `enable-routing-discovery`,
+  `routing-discovery-advertise`, `enable-quic-transport`, `static-relays` and `ping-peers` options in the `p2p` config
+  section. The non-conditional changes include values/provides support on all of the nodes, which will enable DHT to
+  function efficiently for routing discovery.
+
 ## Release v1.2.9
 
 ### Improvements
@@ -262,7 +287,6 @@ for more information on how to configure the node to work with the PoST service.
 ### Improvements
 
 * [#5263](https://github.com/spacemeshos/go-spacemesh/pull/5263) randomize peer selection
-
   without this change node can get stuck after restart on requesting data from peer that is misbehaving.
   log below will be printed repeatedly:
 

diff --git a/activation/handler.go b/activation/handler.go
@@ -8,6 +8,7 @@ import (
 	"time"
 
 	"github.com/spacemeshos/post/shared"
+	"go.uber.org/zap"
 	"golang.org/x/exp/maps"
 
 	"github.com/spacemeshos/go-spacemesh/atxsdata"
@@ -50,6 +51,9 @@ type Handler struct {
 	fetcher         system.Fetcher
 	poetCfg         PoetConfig
 
+	signerMtx sync.Mutex
+	signers   map[types.NodeID]*signing.EdSigner
+
 	// inProgress map gathers ATXs that are currently being processed.
 	// It's used to avoid processing the same ATX twice.
 	inProgress   map[types.ATXID][]chan error
@@ -89,10 +93,23 @@ func NewHandler(
 		tortoise:        tortoise,
 		poetCfg:         poetCfg,
 
+		signers:    make(map[types.NodeID]*signing.EdSigner),
 		inProgress: make(map[types.ATXID][]chan error),
 	}
 }
 
+func (h *Handler) Register(sig *signing.EdSigner) {
+	h.signerMtx.Lock()
+	defer h.signerMtx.Unlock()
+	if _, exists := h.signers[sig.NodeID()]; exists {
+		h.log.Error("signing key already registered", zap.Stringer("id", sig.NodeID()))
+		return
+	}
+
+	h.log.Info("registered signing key", zap.Stringer("id", sig.NodeID()))
+	h.signers[sig.NodeID()] = sig
+}
+
 // ProcessAtx validates the active set size declared in the atx, and contextually validates the atx according to atx
 // validation rules it then stores the atx with flag set to validity of the atx.
 //
@@ -372,48 +389,61 @@ func (h *Handler) storeAtx(ctx context.Context, atx *types.VerifiedActivationTx)
 		return fmt.Errorf("checking if node is malicious: %w", err)
 	}
 	var proof *types.MalfeasanceProof
-	if err := h.cdb.WithTx(ctx, func(dbtx *sql.Tx) error {
-		if !malicious {
-			prev, err := atxs.GetByEpochAndNodeID(dbtx, atx.PublishEpoch, atx.SmesherID)
-			if err != nil && !errors.Is(err, sql.ErrNotFound) {
-				return err
+	if err := h.cdb.WithTx(ctx, func(tx *sql.Tx) error {
+		if malicious {
+			if err := atxs.Add(tx, atx); err != nil && !errors.Is(err, sql.ErrObjectExists) {
+				return fmt.Errorf("add atx to db: %w", err)
 			}
-			// do ID check to be absolutely sure.
-			if prev != nil && prev.ID() != atx.ID() {
-				var atxProof types.AtxProof
-				for i, a := range []*types.VerifiedActivationTx{prev, atx} {
-					atxProof.Messages[i] = types.AtxProofMsg{
-						InnerMsg: types.ATXMetadata{
-							PublishEpoch: a.PublishEpoch,
-							MsgHash:      types.BytesToHash(a.HashInnerBytes()),
-						},
-						SmesherID: a.SmesherID,
-						Signature: a.Signature,
-					}
-				}
-				proof = &types.MalfeasanceProof{
-					Layer: atx.PublishEpoch.FirstLayer(),
-					Proof: types.Proof{
-						Type: types.MultipleATXs,
-						Data: &atxProof,
+			return nil
+		}
+
+		prev, err := atxs.GetByEpochAndNodeID(tx, atx.PublishEpoch, atx.SmesherID)
+		if err != nil && !errors.Is(err, sql.ErrNotFound) {
+			return err
+		}
+
+		// do ID check to be absolutely sure.
+		if prev != nil && prev.ID() != atx.ID() {
+			if _, ok := h.signers[atx.SmesherID]; ok {
+				// if we land here we tried to publish 2 ATXs in the same epoch
+				// don't punish ourselves but fail validation and thereby the handling of the incoming ATX
+				return fmt.Errorf("%s already published an ATX in epoch %d", atx.SmesherID.ShortString(), atx.PublishEpoch)
+			}
+
+			var atxProof types.AtxProof
+			for i, a := range []*types.VerifiedActivationTx{prev, atx} {
+				atxProof.Messages[i] = types.AtxProofMsg{
+					InnerMsg: types.ATXMetadata{
+						PublishEpoch: a.PublishEpoch,
+						MsgHash:      types.BytesToHash(a.HashInnerBytes()),
 					},
+					SmesherID: a.SmesherID,
+					Signature: a.Signature,
 				}
-				encoded, err := codec.Encode(proof)
-				if err != nil {
-					h.log.With().Panic("failed to encode malfeasance proof", log.Err(err))
-				}
-				if err := identities.SetMalicious(dbtx, atx.SmesherID, encoded, time.Now()); err != nil {
-					return fmt.Errorf("add malfeasance proof: %w", err)
-				}
-
-				h.log.WithContext(ctx).With().Warning("smesher produced more than one atx in the same epoch",
-					log.Stringer("smesher", atx.SmesherID),
-					log.Object("prev", prev),
-					log.Object("curr", atx),
-				)
 			}
+			proof = &types.MalfeasanceProof{
+				Layer: atx.PublishEpoch.FirstLayer(),
+				Proof: types.Proof{
+					Type: types.MultipleATXs,
+					Data: &atxProof,
+				},
+			}
+			encoded, err := codec.Encode(proof)
+			if err != nil {
+				h.log.With().Panic("failed to encode malfeasance proof", log.Err(err))
+			}
+			if err := identities.SetMalicious(tx, atx.SmesherID, encoded, time.Now()); err != nil {
+				return fmt.Errorf("add malfeasance proof: %w", err)
+			}
+
+			h.log.WithContext(ctx).With().Warning("smesher produced more than one atx in the same epoch",
+				log.Stringer("smesher", atx.SmesherID),
+				log.Object("prev", prev),
+				log.Object("curr", atx),
+			)
 		}
-		if err := atxs.Add(dbtx, atx); err != nil && !errors.Is(err, sql.ErrObjectExists) {
+
+		if err := atxs.Add(tx, atx); err != nil && !errors.Is(err, sql.ErrObjectExists) {
 			return fmt.Errorf("add atx to db: %w", err)
 		}
 		return nil

diff --git a/activation/handler_test.go b/activation/handler_test.go
@@ -1000,6 +1000,66 @@ func TestHandler_ProcessAtx(t *testing.T) {
 	require.Equal(t, atx2.PublishEpoch.FirstLayer(), got.MalfeasanceProof.Layer)
 }
 
+func TestHandler_ProcessAtx_OwnNotMalicious(t *testing.T) {
+	// Arrange
+	goldenATXID := types.ATXID{2, 3, 4}
+	atxHdlr := newTestHandler(t, goldenATXID)
+
+	sig, err := signing.NewEdSigner()
+	require.NoError(t, err)
+	atxHdlr.Register(sig)
+
+	coinbase := types.GenerateAddress([]byte("aaaa"))
+
+	// Act & Assert
+	atx1 := newActivationTx(
+		t,
+		sig,
+		0,
+		types.EmptyATXID,
+		types.EmptyATXID,
+		nil,
+		types.LayerID(layersPerEpoch).GetEpoch(),
+		0,
+		100,
+		coinbase,
+		100,
+		&types.NIPost{},
+	)
+	atxHdlr.mbeacon.EXPECT().OnAtx(gomock.Any())
+	atxHdlr.mtortoise.EXPECT().OnAtx(gomock.Any())
+	require.NoError(t, atxHdlr.ProcessAtx(context.Background(), atx1))
+
+	// processing an already stored ATX returns no error
+	require.NoError(t, atxHdlr.ProcessAtx(context.Background(), atx1))
+	proof, err := identities.GetMalfeasanceProof(atxHdlr.cdb, sig.NodeID())
+	require.ErrorIs(t, err, sql.ErrNotFound)
+	require.Nil(t, proof)
+
+	// another atx for the same epoch is considered malicious
+	atx2 := newActivationTx(
+		t,
+		sig,
+		1,
+		atx1.ID(),
+		atx1.ID(),
+		nil,
+		types.LayerID(layersPerEpoch+1).GetEpoch(),
+		0,
+		100,
+		coinbase,
+		100,
+		&types.NIPost{},
+	)
+	require.ErrorContains(t,
+		atxHdlr.ProcessAtx(context.Background(), atx2),
+		fmt.Sprintf("%s already published an ATX", sig.NodeID().ShortString()),
+	)
+	proof, err = identities.GetMalfeasanceProof(atxHdlr.cdb, sig.NodeID())
+	require.ErrorIs(t, err, sql.ErrNotFound)
+	require.Nil(t, proof)
+}
+
 func TestHandler_ProcessAtxStoresNewVRFNonce(t *testing.T) {
 	// Arrange
 	goldenATXID := types.ATXID{2, 3, 4}

diff --git a/node/node.go b/node/node.go
@@ -717,6 +717,7 @@ func (app *App) initServices(ctx context.Context) error {
 		app.addLogger(ATXHandlerLogger, lg),
 		app.Config.POET,
 	)
+	atxHandler.Register(app.edSgn)
 
 	// we can't have an epoch offset which is greater/equal than the number of layers in an epoch