Merge branch 'hotfix-1.8.x' into hotfix-1.9.x
npomaroli committed Apr 19, 2023
2 parents dfbdd64 + 880aba6 commit 0b113de
Showing 19 changed files with 274 additions and 33 deletions.
24 changes: 22 additions & 2 deletions LTS-CHANGELOG.adoc
@@ -18,13 +18,23 @@ The LTS changelog lists releases which are only accessible via a commercial subscription.
All fixes and changes in LTS releases will be included in the next minor release. Changes from LTS 1.4.x will be included in release 1.5.0.

[[v1.9.14]]
== 1.9.14 (TBD)
== 1.9.14 (19.04.2023)

icon:check[] Core: The name of a Mesh user is now enforced to be unique at the database level, preventing the creation of users with an already existing username. The duplicate-username detection mechanism has also been improved.

CAUTION: Duplicate usernames must be removed before the update; otherwise Mesh will fail to start!

icon:check[] Monitoring: Calling the `/health/ready` endpoint while an OrientDB backup was being restored would block the request. If too many requests
were blocked this way, Mesh was no longer considered live. This has been fixed: during a restore, the `/health/ready` endpoint now fails immediately, while the liveness status is maintained.

icon:check[] Monitoring: Failed calls to the `/health/...` endpoints no longer log the whole stack trace, since it does not contain useful information.

icon:check[] Core: Migration jobs have been made more robust: a migration job is now aborted when the storage is no longer ready to be written to
(e.g. the write quorum is not reached, or the storage is read-only because disk space has run low). A periodic check, whose interval can be configured
via the setting `migrationTriggerInterval`, resumes processing of aborted jobs (see the configuration sketch below).
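
For illustration, the new setting might be configured in `mesh.yml` as follows (the 30-second value is only an example; a non-positive value disables the periodic trigger):

[source,yaml]
----
# Re-check for aborted migration jobs every 30 seconds.
migrationTriggerInterval: 30000
----

The same value can also be supplied via the environment variable `MESH_MIGRATION_TRIGGER_INTERVAL`.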

icon:check[] Clustering: When running in CUD coordination mode, requests to trigger job migration were not delegated to the current master. This has been fixed.

[[v1.9.13]]
== 1.9.13 (22.03.2023)

@@ -55,13 +65,23 @@ icon:check[] Core: The node migration process has been improved to reduce resour
icon:check[] Core: A corner case in updating the webroot info could throw a false conflict exception when the segment field value was reset for a schema. This has been fixed.

[[v1.8.21]]
== 1.8.21 (TBD)
== 1.8.21 (19.04.2023)

icon:check[] Core: The name of a Mesh user is now enforced to be unique at the database level, preventing the creation of users with an already existing username. The duplicate-username detection mechanism has also been improved.

CAUTION: Duplicate usernames must be removed before the update; otherwise Mesh will fail to start!

icon:check[] Monitoring: Calling the `/health/ready` endpoint while an OrientDB backup was being restored would block the request. If too many requests
were blocked this way, Mesh was no longer considered live. This has been fixed: during a restore, the `/health/ready` endpoint now fails immediately, while the liveness status is maintained.

icon:check[] Monitoring: Failed calls to the `/health/...` endpoints no longer log the whole stack trace, since it does not contain useful information.

icon:check[] Core: Migration jobs have been made more robust: a migration job is now aborted when the storage is no longer ready to be written to
(e.g. the write quorum is not reached, or the storage is read-only because disk space has run low). A periodic check, whose interval can be configured
via the setting `migrationTriggerInterval`, resumes processing of aborted jobs.

icon:check[] Clustering: When running in CUD coordination mode, requests to trigger job migration were not delegated to the current master. This has been fixed.

[[v1.8.20]]
== 1.8.20 (22.03.2023)

24 changes: 23 additions & 1 deletion api/src/main/java/com/gentics/mesh/etc/config/MeshOptions.java
@@ -24,6 +24,7 @@ public abstract class MeshOptions implements Option {
public static final String DEFAULT_DIRECTORY_NAME = "graphdb";
public static final int DEFAULT_MAX_DEPTH = 10;
public static final int DEFAULT_PLUGIN_TIMEOUT = 120;
public static final long DEFAULT_MIGRATION_TRIGGER_INTERVAL = 60_000;

public static final String MESH_DEFAULT_LANG_ENV = "MESH_DEFAULT_LANG";
public static final String MESH_LANGUAGES_FILE_PATH_ENV = "MESH_LANGUAGES_FILE_PATH";
@@ -39,7 +40,9 @@ public abstract class MeshOptions implements Option {
public static final String MESH_INITIAL_ADMIN_PASSWORD_ENV = "MESH_INITIAL_ADMIN_PASSWORD";
public static final String MESH_INITIAL_ADMIN_PASSWORD_FORCE_RESET_ENV = "MESH_INITIAL_ADMIN_PASSWORD_FORCE_RESET";
public static final String MESH_MAX_PURGE_BATCH_SIZE = "MESH_MAX_PURGE_BATCH_SIZE";
private static final String MESH_MAX_MIGRATION_BATCH_SIZE = "MESH_MAX_MIGRATION_BATCH_SIZE";
public static final String MESH_MAX_MIGRATION_BATCH_SIZE = "MESH_MAX_MIGRATION_BATCH_SIZE";
public static final String MESH_MIGRATION_TRIGGER_INTERVAL = "MESH_MIGRATION_TRIGGER_INTERVAL";


// TODO remove this setting. There should not be a default max depth. This is no longer needed once we remove the expand all parameter
private int defaultMaxDepth = DEFAULT_MAX_DEPTH;
@@ -143,6 +146,11 @@ public abstract class MeshOptions implements Option {
@EnvironmentVariable(name = MESH_MAX_MIGRATION_BATCH_SIZE, description = "Override the maximum migration batch size")
private int migrationMaxBatchSize = 50;

@JsonProperty(required = false)
@JsonPropertyDescription("Interval in ms for the automatic migration job trigger. Setting this to a non-positive value will disable automatic job triggering. Default: " + DEFAULT_MIGRATION_TRIGGER_INTERVAL + " ms.")
@EnvironmentVariable(name = MESH_MIGRATION_TRIGGER_INTERVAL, description = "Override the migration trigger interval")
private long migrationTriggerInterval = DEFAULT_MIGRATION_TRIGGER_INTERVAL;

@JsonProperty(required = true)
@JsonPropertyDescription("GraphQL options.")
private GraphQLOptions graphQLOptions = new GraphQLOptions();
@@ -520,6 +528,20 @@ public void setMigrationMaxBatchSize(int migrationMaxBatchSize) {
this.migrationMaxBatchSize = migrationMaxBatchSize;
}

/**
* Get the automatic job migration trigger interval in ms.
* @return interval in ms
*/
public long getMigrationTriggerInterval() {
return migrationTriggerInterval;
}

/**
 * Set the automatic job migration trigger interval in ms.
 *
 * @param migrationTriggerInterval interval in ms
 * @return Fluent API
 */
public MeshOptions setMigrationTriggerInterval(long migrationTriggerInterval) {
this.migrationTriggerInterval = migrationTriggerInterval;
return this;
}
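
// Usage sketch for the new option (the concrete subclass OrientDBMeshOptions is an
// assumption of this example; any MeshOptions implementation works the same way):
//
//   MeshOptions options = new OrientDBMeshOptions()
//       .setMigrationTriggerInterval(30_000); // re-check aborted migration jobs every 30 s
//   options.setMigrationTriggerInterval(0);   // a non-positive value disables the trigger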

/**
* Validate this and the nested options.
*/
AbstractMigrationHandler.java
@@ -35,6 +35,7 @@
import com.gentics.mesh.core.rest.node.FieldMap;
import com.gentics.mesh.core.rest.node.FieldMapImpl;
import com.gentics.mesh.core.rest.node.field.Field;
import com.gentics.mesh.distributed.RequestDelegator;
import com.gentics.mesh.etc.config.MeshOptions;
import com.gentics.mesh.event.EventQueueBatch;
import com.gentics.mesh.metric.MetricsService;
@@ -61,13 +62,19 @@ public abstract class AbstractMigrationHandler extends AbstractHandler implement

protected final MeshOptions options;

private final RequestDelegator delegator;

private final boolean clusteringEnabled;

public AbstractMigrationHandler(Database db, BinaryUploadHandlerImpl binaryFieldHandler, MetricsService metrics,
Provider<EventQueueBatch> batchProvider, MeshOptions options) {
Provider<EventQueueBatch> batchProvider, MeshOptions options, RequestDelegator delegator) {
this.db = db;
this.binaryFieldHandler = binaryFieldHandler;
this.metrics = metrics;
this.batchProvider = batchProvider;
this.options = options;
this.delegator = delegator;
clusteringEnabled = this.options.getClusterOptions().isEnabled();
}

/**
@@ -136,6 +143,28 @@ protected <T> List<Exception> migrateLoop(Queue<T> containers, EventCauseInfo ca
sqb.setCause(cause);
int pollCount = options.getMigrationMaxBatchSize();
while (!containers.isEmpty()) {
// check whether the database is ready for the migration
if (db.isReadOnly(false)) {
errorsDetected.add(new MigrationAbortedException("Database is read-only."));
return errorsDetected;
}
if (clusteringEnabled && db.clusterManager().isClusterTopologyLocked()) {
errorsDetected.add(new MigrationAbortedException("Cluster is locked due to topology change."));
return errorsDetected;
}
if (clusteringEnabled && !db.clusterManager().isWriteQuorumReached()) {
errorsDetected.add(new MigrationAbortedException("Write quorum not reached."));
return errorsDetected;
}
if (clusteringEnabled && !db.clusterManager().isLocalNodeOnline()) {
errorsDetected.add(new MigrationAbortedException("Local node is not online."));
return errorsDetected;
}
if (clusteringEnabled && !delegator.isMaster()) {
errorsDetected.add(new MigrationAbortedException("Instance is not the master."));
return errorsDetected;
}
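// Note: aborting here leaves the remaining containers unprocessed; the periodic
// trigger (configured via migrationTriggerInterval) will resume the aborted job later.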

List<T> containerList = CollectionUtil.pollMany(containers, pollCount);
try {
// Each container migration has its own search queue batch which is then combined with other batch entries.
Expand Down
BranchMigrationImpl.java
Expand Up @@ -33,6 +33,7 @@
import com.gentics.mesh.core.migration.BranchMigration;
import com.gentics.mesh.core.rest.event.node.BranchMigrationCause;
import com.gentics.mesh.core.result.Result;
import com.gentics.mesh.distributed.RequestDelegator;
import com.gentics.mesh.etc.config.MeshOptions;
import com.gentics.mesh.event.EventQueueBatch;
import com.gentics.mesh.metric.MetricsService;
@@ -51,8 +52,9 @@ public class BranchMigrationImpl extends AbstractMigrationHandler implements BranchMigration {
private static final Logger log = LoggerFactory.getLogger(BranchMigrationImpl.class);

@Inject
public BranchMigrationImpl(Database db, BinaryUploadHandlerImpl nodeFieldAPIHandler, MetricsService metrics, Provider<EventQueueBatch> batchProvider, MeshOptions options) {
super(db, nodeFieldAPIHandler, metrics, batchProvider, options);
public BranchMigrationImpl(Database db, BinaryUploadHandlerImpl nodeFieldAPIHandler, MetricsService metrics,
Provider<EventQueueBatch> batchProvider, MeshOptions options, RequestDelegator delegator) {
super(db, nodeFieldAPIHandler, metrics, batchProvider, options, delegator);
}

@Override
@@ -112,7 +114,11 @@ public Completable migrateBranch(BranchMigrationContext context) {
log.error("Encountered migration error.", error);
}
}
result = Completable.error(new CompositeException(errorsDetected));
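// A single detected error is propagated directly; wrapping one cause in a
// CompositeException would only obscure it.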
if (errorsDetected.size() == 1) {
result = Completable.error(errorsDetected.get(0));
} else {
result = Completable.error(new CompositeException(errorsDetected));
}
}
return result;
});
MicronodeMigrationImpl.java
@@ -42,6 +42,7 @@
import com.gentics.mesh.core.rest.schema.FieldSchemaContainer;
import com.gentics.mesh.core.rest.schema.ListFieldSchema;
import com.gentics.mesh.core.verticle.handler.WriteLock;
import com.gentics.mesh.distributed.RequestDelegator;
import com.gentics.mesh.etc.config.MeshOptions;
import com.gentics.mesh.event.EventQueueBatch;
import com.gentics.mesh.metric.MetricsService;
@@ -62,8 +63,10 @@ public class MicronodeMigrationImpl extends AbstractMigrationHandler implements
private final WriteLock writeLock;

@Inject
public MicronodeMigrationImpl(Database db, BinaryUploadHandlerImpl binaryFieldHandler, MetricsService metrics, Provider<EventQueueBatch> batchProvider, WriteLock writeLock, MeshOptions options) {
super(db, binaryFieldHandler, metrics, batchProvider, options);
public MicronodeMigrationImpl(Database db, BinaryUploadHandlerImpl binaryFieldHandler, MetricsService metrics,
Provider<EventQueueBatch> batchProvider, WriteLock writeLock, MeshOptions options,
RequestDelegator delegator) {
super(db, binaryFieldHandler, metrics, batchProvider, options, delegator);
this.writeLock = writeLock;
}

@@ -130,7 +133,11 @@ public Completable migrateMicronodes(MicronodeMigrationContext context) {
log.error("Encountered migration error.", error);
}
}
result = Completable.error(new CompositeException(errorsDetected));
if (errorsDetected.size() == 1) {
result = Completable.error(errorsDetected.get(0));
} else {
result = Completable.error(new CompositeException(errorsDetected));
}
}
return result;
});
NodeMigrationImpl.java
@@ -46,10 +46,12 @@
import com.gentics.mesh.core.endpoint.migration.MigrationStatusHandler;
import com.gentics.mesh.core.endpoint.node.BinaryUploadHandlerImpl;
import com.gentics.mesh.core.migration.AbstractMigrationHandler;
import com.gentics.mesh.core.migration.MigrationAbortedException;
import com.gentics.mesh.core.migration.NodeMigration;
import com.gentics.mesh.core.rest.common.FieldContainer;
import com.gentics.mesh.core.rest.event.node.SchemaMigrationCause;
import com.gentics.mesh.core.verticle.handler.WriteLock;
import com.gentics.mesh.distributed.RequestDelegator;
import com.gentics.mesh.etc.config.MeshOptions;
import com.gentics.mesh.event.EventQueueBatch;
import com.gentics.mesh.metric.MetricsService;
@@ -74,8 +76,8 @@ public class NodeMigrationImpl extends AbstractMigrationHandler implements NodeMigration {

@Inject
public NodeMigrationImpl(Database db, BinaryUploadHandlerImpl nodeFieldAPIHandler, MetricsService metrics, Provider<EventQueueBatch> batchProvider,
WriteLock writeLock, MeshOptions options) {
super(db, nodeFieldAPIHandler, metrics, batchProvider, options);
WriteLock writeLock, MeshOptions options, RequestDelegator delegator) {
super(db, nodeFieldAPIHandler, metrics, batchProvider, options, delegator);
migrationGauge = metrics.longGauge(NODE_MIGRATION_PENDING);
this.writeLock = writeLock;
}
@@ -227,7 +229,18 @@ public Completable migrateNodes(NodeMigrationActionContext context) {
migrationGauge.decrementAndGet();
}
});
} while (batchSize > 0 && currentBatch > 0 && currentBatch >= batchSize);

// If the containers queue is not empty, something went wrong and we must let the migration fail immediately.
if (!containers.isEmpty()) {
if (errorsDetected.size() > 1) {
return Completable.error(new CompositeException(errorsDetected));
} else if (errorsDetected.size() == 1) {
return Completable.error(errorsDetected.get(0));
} else {
return Completable.error(new MigrationAbortedException("Not all containers of the current batch were migrated."));
}
}
} while (batchSize > 0 && currentBatch > 0 && currentBatch >= batchSize);

// TODO prepare errors. They should be easy to understand and to grasp
Completable result = Completable.complete();
@@ -237,7 +250,11 @@ public Completable migrateNodes(NodeMigrationActionContext context) {
log.error("Encountered migration error.", error);
}
}
result = Completable.error(new CompositeException(errorsDetected));
if (errorsDetected.size() == 1) {
result = Completable.error(errorsDetected.get(0));
} else {
result = Completable.error(new CompositeException(errorsDetected));
}
}
return result;
});
JobWorkerVerticleImpl.java
@@ -8,6 +8,8 @@
import com.gentics.mesh.cli.BootstrapInitializer;
import com.gentics.mesh.core.db.Database;
import com.gentics.mesh.core.jobs.JobProcessor;
import com.gentics.mesh.distributed.RequestDelegator;
import com.gentics.mesh.etc.config.MeshOptions;
import com.gentics.mesh.verticle.AbstractJobVerticle;

import dagger.Lazy;
@@ -35,12 +37,39 @@ public class JobWorkerVerticleImpl extends AbstractJobVerticle implements JobWor
private Lazy<BootstrapInitializer> boot;
private JobProcessor jobProcessor;
private Database db;
private final RequestDelegator delegator;
private final boolean clusteringEnabled;

@Inject
public JobWorkerVerticleImpl(Database db, Lazy<BootstrapInitializer> boot, JobProcessor jobProcessor) {
public JobWorkerVerticleImpl(Database db, Lazy<BootstrapInitializer> boot, JobProcessor jobProcessor,
MeshOptions options, RequestDelegator delegator) {
this.db = db;
this.boot = boot;
this.jobProcessor = jobProcessor;
this.delegator = delegator;
this.clusteringEnabled = options.getClusterOptions().isEnabled();
}

@Override
public void start() throws Exception {
super.start();

long migrationTriggerInterval = boot.get().mesh().getOptions().getMigrationTriggerInterval();

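// Periodically trigger job processing, so that migration jobs aborted earlier are resumed.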
if (migrationTriggerInterval > 0) {
vertx.setPeriodic(migrationTriggerInterval, id -> {
if (!isCurrentMaster()) {
log.debug("Not invoking job processing, because instance is not the current master");
} else if (!isDatabaseReadyForJobs()) {
log.debug("Not invoking job processing, because instance is not ready to process jobs");
} else if (jobProcessor.isProcessing()) {
log.debug("Not invoking job processing, because jobs are currently processed");
} else {
log.debug("Invoke job processing");
vertx.eventBus().publish(getJobAdress(), null);
}
});
}
}

@Override
@@ -58,4 +87,32 @@ public Completable executeJob(Message<Object> message) {
return Completable.defer(() -> jobProcessor.process());
}

/**
* Check whether the instance is currently the master
* @return true for the master (or clustering not enabled)
*/
private boolean isCurrentMaster() {
if (clusteringEnabled) {
return delegator.isMaster();
} else {
return true;
}
}

/**
* Check whether the database is ready to process jobs. When clustering is enabled, this will check whether
* <ol>
* <li>The local database is online</li>
* <li>The write quorum is reached</li>
* <li>The cluster is not locked due to topology changes</li>
* </ol>
* @return true when the database is ready for job processing
*/
private boolean isDatabaseReadyForJobs() {
if (clusteringEnabled) {
return db.clusterManager().isLocalNodeOnline() && db.clusterManager().isWriteQuorumReached() && !db.clusterManager().isClusterTopologyLocked();
} else {
return true;
}
}
}