diff --git a/.github/README.md b/.github/README.md index ff2a266..8692905 100644 --- a/.github/README.md +++ b/.github/README.md @@ -45,10 +45,6 @@ These values ensure that Aside from the optimized syncing process, pruning already validated data is the second role of the supervysor to fulfill its goal of reducing disk storage requirements. Therefore, a custom pruning method is used, which relies on the provided Tendermint functionality of pruning all blocks until a specified height. In the context of the supervysor, this until-height should always be lower than the latest validated height of the KYVE data pool to ensure no data is pruned that needs validation. Unfortunately, the node has to be stopped to execute the pruning process, while a pruning-interval needs specification in hours. During this interval, the supervysor halts the current node process, prunes all validated blocks, and restarts the node. Due to the required time to connect with peers and to prevent the pool from catching up with the node, the pruning process is only initiated if the node is in GhostMode. If the node is in NormalMode, even if the interval reaches the pruning threshold, pruning will be enabled immediately after the node enters GhostMode. Additionally, it is recommended to set the pruning-interval to a value of at least six hours to ensure there is enough time to find peers before the pool catches up. -This ensures that -* only the required blocks for the next 2 days are kept locally, everything else will be pruned, -* because `min_retain_blocks > height_difference_max`, nothing will be pruned before it was validated in the data pool. - ## Requirements The supervysor manages the process of the data source node. First of all, it should be ensured that this node can run successfully, which can be tested by trying to sync the first `n` blocks. In addition, to successfully participate in a KYVE data pool, it is necessary to create a protocol validator and join a data pool. Further information can be found here: https://docs.kyve.network/validators/protocol_nodes/overview @@ -99,9 +95,9 @@ To use the supervysor, you first need to initialize it: ```bash supervysor init ---binary-path string 'path to chain binaries (e.g. ~/go/bin/osmosisd)' +--binary string 'path to chain binaries (e.g. ~/go/bin/osmosisd)' --chain-id string 'KYVE chain-id' ---home-path string 'path to home directory (e.g. ~/.osmosisd)' +--home string 'path to home directory (e.g. ~/.osmosisd)' --metrics string 'exposing Prometheus metrics ("true" or "false")' --pool-id int 'KYVE pool-id' --seeds string 'seeds for the node to connect' @@ -136,9 +132,9 @@ With your node being able to run using Cosmovisor, you can stop the process and ```bash supervysor init \ ---binary-path '/root/go/bin/cosmovisor' \ +--binary '/root/go/bin/cosmovisor' \ --chain-id 'kyve-1' \ ---home-path '/root/.osmosisd' \ +--home '/root/.osmosisd' \ --pool-id 1 \ --seeds '6bcdbcfd5d2c6ba58460f10dbcfde58278212833@osmosis.artifact-staking.io:26656,ade4d8bc8cbe014af6ebdf3cb7b1e9ad36f412c0@seeds.polkachu.com:12556' ``` diff --git a/Makefile b/Makefile index 555c50b..3213ca1 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION := v0.2.2 +VERSION := v0.2.3 ldflags := $(LDFLAGS) ldflags += -X main.Version=$(VERSION) diff --git a/backup/backup.go b/backup/backup.go index 7b51b69..1c0aecf 100644 --- a/backup/backup.go +++ b/backup/backup.go @@ -7,7 +7,6 @@ import ( "os/exec" "path/filepath" "sort" - "strings" ) func ClearBackups(srcPath string, threshold int) error { @@ -20,10 +19,7 @@ func ClearBackups(srcPath string, threshold int) error { backups := []os.DirEntry{} for _, entry := range entries { if entry.IsDir() { - // Make sure to only clear timestamped backups - if strings.HasPrefix(entry.Name(), "20") && len(entry.Name()) == 15 { - backups = append(backups, entry) - } + backups = append(backups, entry) } } diff --git a/cmd/supervysor/backup.go b/cmd/supervysor/backup.go index 823326d..8de3e6b 100644 --- a/cmd/supervysor/backup.go +++ b/cmd/supervysor/backup.go @@ -2,6 +2,9 @@ package main import ( "fmt" + "path/filepath" + + "github.com/KYVENetwork/supervysor/store" "github.com/KYVENetwork/supervysor/backup" "github.com/KYVENetwork/supervysor/cmd/supervysor/helpers" @@ -12,16 +15,15 @@ var ( compressionType string destPath string maxBackups int - srcPath string ) func init() { - backupCmd.Flags().StringVar(&srcPath, "src-path", "", "source path of the directory to backup") - if err := backupCmd.MarkFlagRequired("src-path"); err != nil { + backupCmd.Flags().StringVar(&home, "home", "", "path to home directory (e.g. /root/.osmosisd)") + if err := backupCmd.MarkFlagRequired("home"); err != nil { panic(fmt.Errorf("flag 'src-path' should be required: %w", err)) } - backupCmd.Flags().StringVar(&destPath, "dest-path", "", "destination path of the written backup (default '~/.ksync/backups)'") + backupCmd.Flags().StringVar(&destPath, "dest-path", "", "destination path of the written backup (default '~/.supervysor/backups)'") backupCmd.Flags().StringVar(&compressionType, "compression", "", "compression type to compress backup directory ['tar.gz', 'zip', '']") @@ -38,14 +40,33 @@ var backupCmd = &cobra.Command{ return } + config, err := helpers.LoadConfig(home) + if err != nil { + logger.Error("failed to load tendermint config", "err", err) + return + } + + // Load block store + blockStoreDB, blockStore, err := store.GetBlockstoreDBs(config) + if err != nil { + logger.Error("failed to get blockstore dbs") + return + } + defer blockStoreDB.Close() + if destPath == "" { - d, err := helpers.CreateDestPath(backupDir) + logger.Info("height", "h", blockStore.Height()) + d, err := helpers.CreateDestPath(backupDir, blockStore.Height()) if err != nil { + logger.Error("could not create destination path", "err", err) return } destPath = d } + // Only backup data directory + srcPath := filepath.Join(home, "data") + if err := helpers.ValidatePaths(srcPath, destPath); err != nil { return } diff --git a/cmd/supervysor/helpers/helpers.go b/cmd/supervysor/helpers/helpers.go index f8b9128..2f09d30 100644 --- a/cmd/supervysor/helpers/helpers.go +++ b/cmd/supervysor/helpers/helpers.go @@ -5,7 +5,7 @@ import ( "net/http" "os" "path/filepath" - "time" + "strconv" "github.com/spf13/viper" @@ -15,16 +15,15 @@ import ( cfg "github.com/tendermint/tendermint/config" ) -func CreateDestPath(backupDir string) (string, error) { - t := time.Now().Format("20060102_150405") - - if err := os.Mkdir(filepath.Join(backupDir, t), 0o755); err != nil { +func CreateDestPath(backupDir string, latestHeight int64) (string, error) { + if err := os.Mkdir(filepath.Join(backupDir, strconv.FormatInt(latestHeight, 10)), 0o755); err != nil { return "", fmt.Errorf("error creating backup directory: %v", err) } - if err := os.Mkdir(filepath.Join(backupDir, t, "data"), 0o755); err != nil { + fmt.Println(filepath.Join(backupDir, strconv.FormatInt(latestHeight, 10))) + if err := os.Mkdir(filepath.Join(backupDir, strconv.FormatInt(latestHeight, 10), "data"), 0o755); err != nil { return "", fmt.Errorf("error creating data backup directory: %v", err) } - return filepath.Join(backupDir, t, "data"), nil + return filepath.Join(backupDir, strconv.FormatInt(latestHeight, 10), "data"), nil } func GetDirectorySize(dirPath string) (float64, error) { @@ -148,11 +147,11 @@ func NewMetrics(reg prometheus.Registerer) *types.Metrics { return m } -func StartMetricsServer(reg *prometheus.Registry) error { +func StartMetricsServer(reg *prometheus.Registry, port int) error { // Create metrics endpoint promHandler := promhttp.HandlerFor(reg, promhttp.HandlerOpts{}) http.Handle("/metrics", promHandler) - err := http.ListenAndServe(":26660", nil) + err := http.ListenAndServe(fmt.Sprintf(":%v", port), nil) if err != nil { return err } diff --git a/cmd/supervysor/init.go b/cmd/supervysor/init.go index c766793..d67c6b9 100644 --- a/cmd/supervysor/init.go +++ b/cmd/supervysor/init.go @@ -5,6 +5,8 @@ import ( "fmt" "os" + "golang.org/x/exp/slices" + "github.com/KYVENetwork/supervysor/cmd/supervysor/helpers" "github.com/KYVENetwork/supervysor/types" @@ -17,11 +19,12 @@ import ( var ( abciEndpoint string - binaryPath string + binary string chainId string fallbackEndpoints string - homePath string + home string metrics bool + metricsPort int poolId int seeds string pruningInterval int @@ -30,18 +33,13 @@ var ( ) func init() { - initCmd.Flags().StringVar(&chainId, "chain-id", "", "KYVE chain-id") - if err := initCmd.MarkFlagRequired("chain-id"); err != nil { - panic(fmt.Errorf("flag 'chain-id' should be required: %w", err)) - } - - initCmd.Flags().StringVar(&binaryPath, "binary-path", "", "path to chain binaries or cosmovisor (e.g. /root/go/bin/cosmovisor)") - if err := initCmd.MarkFlagRequired("binary-path"); err != nil { + initCmd.Flags().StringVar(&binary, "binary", "", "path to chain binaries or cosmovisor (e.g. /root/go/bin/cosmovisor)") + if err := initCmd.MarkFlagRequired("binary"); err != nil { panic(fmt.Errorf("flag 'binary-path' should be required: %w", err)) } - initCmd.Flags().StringVar(&homePath, "home-path", "", "path to home directory (e.g. /root/.osmosisd)") - if err := initCmd.MarkFlagRequired("home-path"); err != nil { + initCmd.Flags().StringVar(&home, "home", "", "path to home directory (e.g. /root/.osmosisd)") + if err := initCmd.MarkFlagRequired("home"); err != nil { panic(fmt.Errorf("flag 'home-path' should be required: %w", err)) } @@ -55,12 +53,16 @@ func init() { panic(fmt.Errorf("flag 'seeds' should be required: %w", err)) } + initCmd.Flags().StringVar(&chainId, "chain-id", "kyve-1", "KYVE chain-id") + initCmd.Flags().StringVar(&fallbackEndpoints, "fallback-endpoints", "", "additional endpoints to query KYVE pool height") initCmd.Flags().IntVar(&pruningInterval, "pruning-interval", 24, "block-pruning interval (hours)") initCmd.Flags().BoolVar(&metrics, "metrics", true, "exposing Prometheus metrics (true or false)") + initCmd.Flags().IntVar(&metricsPort, "metrics-port", 26660, "port for metrics server") + initCmd.Flags().StringVar(&abciEndpoint, "abci-endpoint", "http://127.0.0.1:26657", "ABCI Endpoint to request node information") } @@ -68,84 +70,86 @@ var initCmd = &cobra.Command{ Use: "init", Short: "Initialize supervysor", RunE: func(cmd *cobra.Command, args []string) error { - return InitializeSupervysor() - }, -} - -// InitializeSupervysor initializes the required supervysor config and performs some basic checks. -func InitializeSupervysor() error { - if homePath == "" { - logger.Error("home directory can not be empty") - return fmt.Errorf("empty home directory path") - } - - if pruningInterval <= 6 { - logger.Error("pruning-interval should be higher than 6 hours") - } - - if err := settings.InitializeSettings(binaryPath, homePath, poolId, false, seeds, chainId, fallbackEndpoints); err != nil { - logger.Error("could not initialize settings", "err", err) - return err - } - logger.Info("successfully initialized settings") - - configPath, err := helpers.GetSupervysorDir() - if err != nil { - logger.Error("could not get supervysor directory path", "err", err) - return err - } + supportedChains := []string{"kyve-1", "kaon-1", "korellia", "korellia-2"} + if !slices.Contains(supportedChains, chainId) { + logger.Error("specified chain-id is not supported", "chain-id", chainId) + return fmt.Errorf("not supported chain-id") + } - if _, err = os.Stat(configPath + "/config.toml"); err == nil { - logger.Info(fmt.Sprintf("supervysor was already initialized and is editable under %s/config.toml", configPath)) - return nil - } else if errors.Is(err, os.ErrNotExist) { - if _, err = os.Stat(configPath); errors.Is(err, os.ErrNotExist) { - err = os.Mkdir(configPath, 0o755) - if err != nil { - return err - } + if home == "" { + logger.Error("home directory can not be empty") + return fmt.Errorf("empty home directory path") } - logger.Info("initializing supverysor...") - - config := types.SupervysorConfig{ - ABCIEndpoint: abciEndpoint, - BinaryPath: binaryPath, - ChainId: chainId, - HomePath: homePath, - Interval: 10, - Metrics: metrics, - PoolId: poolId, - Seeds: seeds, - FallbackEndpoints: fallbackEndpoints, - PruningInterval: pruningInterval, - HeightDifferenceMax: settings.Settings.MaxDifference, - HeightDifferenceMin: settings.Settings.MaxDifference / 2, - StateRequests: false, + + if pruningInterval <= 6 { + logger.Error("pruning-interval should be higher than 6 hours") } - b, err := toml.Marshal(config) - if err != nil { - logger.Error("could not unmarshal config", "err", err) + + if err := settings.InitializeSettings(binary, home, poolId, false, seeds, chainId, fallbackEndpoints); err != nil { + logger.Error("could not initialize settings", "err", err) return err } + logger.Info("successfully initialized settings") - err = os.WriteFile(configPath+"/config.toml", b, 0o755) + configPath, err := helpers.GetSupervysorDir() if err != nil { - logger.Error("could not write config file", "err", err) + logger.Error("could not get supervysor directory path", "err", err) return err } - _, err = getSupervysorConfig() - if err != nil { - logger.Error("could not load config file", "err", err) + if _, err = os.Stat(configPath + "/config.toml"); err == nil { + logger.Info(fmt.Sprintf("supervysor was already initialized and is editable under %s/config.toml", configPath)) + return nil + } else if errors.Is(err, os.ErrNotExist) { + if _, err = os.Stat(configPath); errors.Is(err, os.ErrNotExist) { + err = os.Mkdir(configPath, 0o755) + if err != nil { + return err + } + } + logger.Info("initializing supverysor...") + + config := types.SupervysorConfig{ + ABCIEndpoint: abciEndpoint, + BinaryPath: binary, + ChainId: chainId, + FallbackEndpoints: fallbackEndpoints, + HeightDifferenceMax: settings.Settings.MaxDifference, + HeightDifferenceMin: settings.Settings.MaxDifference / 2, + HomePath: home, + Interval: 10, + Metrics: metrics, + MetricsPort: metricsPort, + PoolId: poolId, + PruningInterval: pruningInterval, + Seeds: seeds, + StateRequests: false, + } + b, err := toml.Marshal(config) + if err != nil { + logger.Error("could not unmarshal config", "err", err) + return err + } + + err = os.WriteFile(configPath+"/config.toml", b, 0o755) + if err != nil { + logger.Error("could not write config file", "err", err) + return err + } + + _, err = getSupervysorConfig() + if err != nil { + logger.Error("could not load config file", "err", err) + return err + } + + logger.Info(fmt.Sprintf("successfully initialized: config available at %s/config.toml", configPath)) + return nil + } else { + logger.Error("could not get supervysor directory") return err } - - logger.Info(fmt.Sprintf("successfully initialized: config available at %s/config.toml", configPath)) - return nil - } else { - logger.Error("could not get supervysor directory") - return err - } + }, } // getSupervysorConfig returns the supervysor config.toml file. diff --git a/cmd/supervysor/prune.go b/cmd/supervysor/prune.go index ba60894..7fc7ff2 100644 --- a/cmd/supervysor/prune.go +++ b/cmd/supervysor/prune.go @@ -10,7 +10,7 @@ import ( var untilHeight int64 func init() { - pruneCmd.Flags().StringVar(&homePath, "home", "", "home directory") + pruneCmd.Flags().StringVar(&home, "home", "", "home directory") if err := pruneCmd.MarkFlagRequired("home"); err != nil { panic(fmt.Errorf("flag 'home' should be required: %w", err)) } @@ -25,7 +25,7 @@ var pruneCmd = &cobra.Command{ Use: "prune-blocks", Short: "Prune blocks until a specific height", Run: func(cmd *cobra.Command, args []string) { - if err := store.PruneBlocks(homePath, untilHeight, logger); err != nil { + if err := store.PruneBlocks(home, untilHeight, logger); err != nil { logger.Error(err.Error()) } }, diff --git a/cmd/supervysor/start.go b/cmd/supervysor/start.go index de98661..8ed38ca 100644 --- a/cmd/supervysor/start.go +++ b/cmd/supervysor/start.go @@ -22,17 +22,6 @@ var startCmd = &cobra.Command{ Short: "Start a supervysed Tendermint node", DisableFlagParsing: true, RunE: func(cmd *cobra.Command, flags []string) error { - // Create Prometheus registry - reg := prometheus.NewRegistry() - m := helpers.NewMetrics(reg) - - go func() { - err := helpers.StartMetricsServer(reg) - if err != nil { - panic(err) - } - }() - // Load initialized config. config, err := getSupervysorConfig() if err != nil { @@ -41,6 +30,19 @@ var startCmd = &cobra.Command{ } metrics := config.Metrics + // Create Prometheus registry + reg := prometheus.NewRegistry() + m := helpers.NewMetrics(reg) + + if metrics { + go func() { + err := helpers.StartMetricsServer(reg, metricsPort) + if err != nil { + panic(err) + } + }() + } + e := executor.NewExecutor(&logger, config) // Start data source node initially. @@ -97,17 +99,29 @@ var startCmd = &cobra.Command{ if config.PruningInterval != 0 { logger.Info("current pruning count", "pruning-count", fmt.Sprintf("%.2f", pruningCount), "pruning-threshold", config.PruningInterval) - if pruningCount > float64(config.PruningInterval) && currentMode == "ghost" && nodeHeight > 0 { - pruneHeight := poolHeight - if nodeHeight < poolHeight { - pruneHeight = nodeHeight - } - logger.Info("pruning blocks after node shutdown", "until-height", pruneHeight) - - err = e.PruneBlocks(config.HomePath, pruneHeight-1, flags) - if err != nil { - logger.Error("could not prune blocks", "err", err) - return err + if pruningCount > float64(config.PruningInterval) && nodeHeight > 0 { + if currentMode == "ghost" { + pruneHeight := poolHeight + if nodeHeight < poolHeight { + pruneHeight = nodeHeight + } + logger.Info("pruning blocks after node shutdown", "until-height", pruneHeight) + + err = e.PruneBlocks(config.HomePath, pruneHeight-1, flags) + if err != nil { + logger.Error("could not prune blocks", "err", err) + return err + } + } else { + if nodeHeight < poolHeight { + logger.Info("pruning blocks after node shutdown", "until-height", nodeHeight) + + err = e.PruneBlocks(config.HomePath, nodeHeight-1, flags) + if err != nil { + logger.Error("could not prune blocks", "err", err) + return err + } + } } pruningCount = 0 } diff --git a/executor/executor.go b/executor/executor.go index 34743c9..fdd1f3d 100644 --- a/executor/executor.go +++ b/executor/executor.go @@ -136,7 +136,7 @@ func (e *Executor) PruneBlocks(homePath string, pruneHeight int, flags []string) } func (e *Executor) GetHeight() (int, error) { - return node.GetNodeHeight(e.Logger, &e.Process, e.Cfg.ABCIEndpoint, 0) + return node.GetNodeHeight(e.Logger, &e.Process, e.Cfg.ABCIEndpoint) } func (e *Executor) Shutdown() error { diff --git a/node/node.go b/node/node.go index 9c5aa28..b40cb0c 100644 --- a/node/node.go +++ b/node/node.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "math" "net/http" "os" "os/exec" @@ -21,24 +22,24 @@ import ( ) // The GetNodeHeight function retrieves the height of the node by querying the ABCI endpoint. -// It uses recursion with a maximum depth of 10 to handle delays or failures. -// It returns the nodeHeight if successful or an error message if the recursion depth reaches the limit (200s). -func GetNodeHeight(log log.Logger, p *types.ProcessType, abciEndpoint string, recursionDepth int) (int, error) { - if recursionDepth < 10 { +// It uses exponential backoff +func GetNodeHeight(log log.Logger, p *types.ProcessType, abciEndpoint string) (int, error) { + for i := 0; i <= types.BackoffMaxRetries; i++ { + delay := time.Duration(math.Pow(2, float64(i))) * time.Second if p.Id == -1 { - log.Error(fmt.Sprintf("node hasn't started yet. Try again in 20s ... (%d/10)", recursionDepth+1)) + log.Error(fmt.Sprintf("node hasn't started yet. Try again in %vs ...", delay)) - time.Sleep(time.Second * 20) - return GetNodeHeight(log, p, abciEndpoint, recursionDepth+1) + time.Sleep(delay) + continue } response, err := http.Get(abciEndpoint + "/abci_info?") if err != nil { - log.Error(fmt.Sprintf("failed to query height. Try again in 20s ... (%d/10)", recursionDepth+1)) + log.Error(fmt.Sprintf("failed to query height. Try again in %vs ...", delay)) - time.Sleep(time.Second * 20) - return GetNodeHeight(log, p, abciEndpoint, recursionDepth+1) + time.Sleep(delay) + continue } else { responseData, err := io.ReadAll(response.Body) if err != nil { @@ -59,9 +60,8 @@ func GetNodeHeight(log log.Logger, p *types.ProcessType, abciEndpoint string, re return nodeHeight, nil } - } else { - return 0, fmt.Errorf("could not get node height, exiting ...") } + return 0, fmt.Errorf("could not query node height") } // StartNode starts the node process in Normal Mode and returns the os.Process object representing diff --git a/types/constants.go b/types/constants.go index e1be5e9..11fc38d 100644 --- a/types/constants.go +++ b/types/constants.go @@ -15,3 +15,7 @@ var ( "https://api-us-1.kyve.network", } ) + +const ( + BackoffMaxRetries = 15 +) diff --git a/types/types.go b/types/types.go index b2119f6..c0bff5d 100644 --- a/types/types.go +++ b/types/types.go @@ -10,16 +10,17 @@ type SupervysorConfig struct { ABCIEndpoint string BinaryPath string ChainId string - HomePath string - PoolId int - Seeds string FallbackEndpoints string - PruningInterval int - StateRequests bool - Interval int HeightDifferenceMax int HeightDifferenceMin int + HomePath string + Interval int Metrics bool + MetricsPort int + PoolId int + PruningInterval int + Seeds string + StateRequests bool } type Config = tmCfg.Config