diff --git a/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java b/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java index 9da1336642a64..6c248a32c9928 100644 --- a/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java @@ -32,10 +32,12 @@ package org.opensearch.gateway; +import org.opensearch.Version; import org.opensearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction; import org.opensearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest; import org.opensearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsAction; import org.opensearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsRequest; +import org.opensearch.action.admin.cluster.reroute.ClusterRerouteResponse; import org.opensearch.action.admin.cluster.shards.ClusterSearchShardsGroup; import org.opensearch.action.admin.cluster.shards.ClusterSearchShardsResponse; import org.opensearch.action.admin.indices.recovery.RecoveryResponse; @@ -46,6 +48,7 @@ import org.opensearch.cluster.coordination.ElectionSchedulerFactory; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.routing.UnassignedInfo; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; @@ -63,6 +66,8 @@ import org.opensearch.indices.recovery.RecoveryState; import org.opensearch.indices.replication.common.ReplicationLuceneIndex; import org.opensearch.indices.store.ShardAttributes; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataBatch; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper; import org.opensearch.plugins.Plugin; import org.opensearch.test.InternalSettingsPlugin; import org.opensearch.test.InternalTestCluster.RestartCallback; @@ -82,8 +87,11 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.ExecutionException; import java.util.stream.IntStream; +import static java.util.Collections.emptyMap; +import static java.util.Collections.emptySet; import static org.opensearch.cluster.coordination.ClusterBootstrapService.INITIAL_CLUSTER_MANAGER_NODES_SETTING; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; @@ -817,6 +825,131 @@ public void testShardFetchCorruptedShardsUsingBatchAction() throws Exception { assertTrue(nodeGatewayStartedShards.primary()); } + public void testSingleShardStoreFetchUsingBatchAction() throws ExecutionException, InterruptedException { + String indexName = "test"; + DiscoveryNode[] nodes = getDiscoveryNodes(); + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch response = prepareAndSendRequest( + new String[] { indexName }, + nodes + ); + Index index = resolveIndex(indexName); + ShardId shardId = new ShardId(index, 0); + TransportNodesListShardStoreMetadataBatch.NodeStoreFilesMetadata nodeStoreFilesMetadata = response.getNodesMap() + .get(nodes[0].getId()) + .getNodeStoreFilesMetadataBatch() + .get(shardId); + assertNodeStoreFilesMetadataSuccessCase(nodeStoreFilesMetadata, shardId); + } + + public void testShardStoreFetchMultiNodeMultiIndexesUsingBatchAction() throws Exception { + internalCluster().startNodes(2); + String indexName1 = "test1"; + String indexName2 = "test2"; + DiscoveryNode[] nodes = getDiscoveryNodes(); + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch response = prepareAndSendRequest( + new String[] { indexName1, indexName2 }, + nodes + ); + ClusterSearchShardsResponse searchShardsResponse = client().admin().cluster().prepareSearchShards(indexName1, indexName2).get(); + for (ClusterSearchShardsGroup clusterSearchShardsGroup : searchShardsResponse.getGroups()) { + ShardId shardId = clusterSearchShardsGroup.getShardId(); + ShardRouting[] shardRoutings = clusterSearchShardsGroup.getShards(); + assertEquals(2, shardRoutings.length); + for (ShardRouting shardRouting : shardRoutings) { + TransportNodesListShardStoreMetadataBatch.NodeStoreFilesMetadata nodeStoreFilesMetadata = response.getNodesMap() + .get(shardRouting.currentNodeId()) + .getNodeStoreFilesMetadataBatch() + .get(shardId); + assertNodeStoreFilesMetadataSuccessCase(nodeStoreFilesMetadata, shardId); + } + } + } + + public void testShardStoreFetchNodeNotConnectedUsingBatchAction() { + DiscoveryNode nonExistingNode = new DiscoveryNode("foo", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT); + String indexName = "test"; + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch response = prepareAndSendRequest( + new String[] { indexName }, + new DiscoveryNode[] { nonExistingNode } + ); + assertTrue(response.hasFailures()); + assertEquals(1, response.failures().size()); + assertEquals(nonExistingNode.getId(), response.failures().get(0).nodeId()); + } + + public void testShardStoreFetchCorruptedIndexUsingBatchAction() throws Exception { + internalCluster().startNodes(2); + String index1Name = "test1"; + String index2Name = "test2"; + prepareIndices(new String[] { index1Name, index2Name }, 1, 1); + Map shardAttributesMap = prepareRequestMap(new String[] { index1Name, index2Name }, 1); + Index index1 = resolveIndex(index1Name); + ShardId shardId1 = new ShardId(index1, 0); + ClusterSearchShardsResponse searchShardsResponse = client().admin().cluster().prepareSearchShards(index1Name).get(); + assertEquals(2, searchShardsResponse.getNodes().length); + + // corrupt test1 index shards + corruptShard(searchShardsResponse.getNodes()[0].getName(), shardId1); + corruptShard(searchShardsResponse.getNodes()[1].getName(), shardId1); + ClusterRerouteResponse clusterRerouteResponse = client().admin().cluster().prepareReroute().setRetryFailed(false).get(); + DiscoveryNode[] discoveryNodes = getDiscoveryNodes(); + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch response; + response = ActionTestUtils.executeBlocking( + internalCluster().getInstance(TransportNodesListShardStoreMetadataBatch.class), + new TransportNodesListShardStoreMetadataBatch.Request(shardAttributesMap, discoveryNodes) + ); + Map nodeStoreFilesMetadata = response.getNodesMap() + .get(discoveryNodes[0].getId()) + .getNodeStoreFilesMetadataBatch(); + // We don't store exception in case of corrupt index, rather just return an empty response + assertNull(nodeStoreFilesMetadata.get(shardId1).getStoreFileFetchException()); + assertEquals(shardId1, nodeStoreFilesMetadata.get(shardId1).storeFilesMetadata().shardId()); + assertTrue(nodeStoreFilesMetadata.get(shardId1).storeFilesMetadata().isEmpty()); + + Index index2 = resolveIndex(index2Name); + ShardId shardId2 = new ShardId(index2, 0); + assertNodeStoreFilesMetadataSuccessCase(nodeStoreFilesMetadata.get(shardId2), shardId2); + } + + private void prepareIndices(String[] indices, int numberOfPrimaryShards, int numberOfReplicaShards) { + for (String index : indices) { + createIndex( + index, + Settings.builder() + .put(SETTING_NUMBER_OF_SHARDS, numberOfPrimaryShards) + .put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicaShards) + .build() + ); + index(index, "type", "1", Collections.emptyMap()); + flush(index); + } + } + + private TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch prepareAndSendRequest( + String[] indices, + DiscoveryNode[] nodes + ) { + Map shardAttributesMap = null; + prepareIndices(indices, 1, 1); + shardAttributesMap = prepareRequestMap(indices, 1); + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch response; + return ActionTestUtils.executeBlocking( + internalCluster().getInstance(TransportNodesListShardStoreMetadataBatch.class), + new TransportNodesListShardStoreMetadataBatch.Request(shardAttributesMap, nodes) + ); + } + + private void assertNodeStoreFilesMetadataSuccessCase( + TransportNodesListShardStoreMetadataBatch.NodeStoreFilesMetadata nodeStoreFilesMetadata, + ShardId shardId + ) { + assertNull(nodeStoreFilesMetadata.getStoreFileFetchException()); + TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata storeFileMetadata = nodeStoreFilesMetadata.storeFilesMetadata(); + assertFalse(storeFileMetadata.isEmpty()); + assertEquals(shardId, storeFileMetadata.shardId()); + assertNotNull(storeFileMetadata.peerRecoveryRetentionLeases()); + } + private void assertNodeGatewayStartedShardsHappyCase( TransportNodesListGatewayStartedShardsBatch.NodeGatewayStartedShard nodeGatewayStartedShards ) { diff --git a/server/src/main/java/org/opensearch/gateway/ReplicaShardAllocator.java b/server/src/main/java/org/opensearch/gateway/ReplicaShardAllocator.java index f530052c5bcd1..89db3198662fa 100644 --- a/server/src/main/java/org/opensearch/gateway/ReplicaShardAllocator.java +++ b/server/src/main/java/org/opensearch/gateway/ReplicaShardAllocator.java @@ -51,8 +51,8 @@ import org.opensearch.common.unit.TimeValue; import org.opensearch.core.common.unit.ByteSizeValue; import org.opensearch.index.store.StoreFileMetadata; -import org.opensearch.indices.store.TransportNodesListShardStoreMetadata; import org.opensearch.indices.store.TransportNodesListShardStoreMetadata.NodeStoreFilesMetadata; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata; import java.util.ArrayList; import java.util.Collections; @@ -106,7 +106,7 @@ public void processExistingRecoveries(RoutingAllocation allocation) { assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary"; assert primaryShard.currentNodeId() != null; final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId()); - final TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores); + final StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores); if (primaryStore == null) { // if we can't find the primary data, it is probably because the primary shard is corrupted (and listing failed) // just let the recovery find it out, no need to do anything about it for the initializing shard @@ -223,7 +223,7 @@ public AllocateUnassignedDecision makeAllocationDecision( } assert primaryShard.currentNodeId() != null; final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId()); - final TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores); + final StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores); if (primaryStore == null) { // if we can't find the primary data, it is probably because the primary shard is corrupted (and listing failed) // we want to let the replica be allocated in order to expose the actual problem with the primary that the replica @@ -357,10 +357,7 @@ private static List augmentExplanationsWithStoreInfo( /** * Finds the store for the assigned shard in the fetched data, returns null if none is found. */ - private static TransportNodesListShardStoreMetadata.StoreFilesMetadata findStore( - DiscoveryNode node, - AsyncShardFetch.FetchResult data - ) { + private static StoreFilesMetadata findStore(DiscoveryNode node, AsyncShardFetch.FetchResult data) { NodeStoreFilesMetadata nodeFilesStore = data.getData().get(node); if (nodeFilesStore == null) { return null; @@ -373,7 +370,7 @@ private MatchingNodes findMatchingNodes( RoutingAllocation allocation, boolean noMatchFailedNodes, DiscoveryNode primaryNode, - TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore, + StoreFilesMetadata primaryStore, AsyncShardFetch.FetchResult data, boolean explain ) { @@ -386,7 +383,7 @@ private MatchingNodes findMatchingNodes( && shard.unassignedInfo().getFailedNodeIds().contains(discoNode.getId())) { continue; } - TransportNodesListShardStoreMetadata.StoreFilesMetadata storeFilesMetadata = nodeStoreEntry.getValue().storeFilesMetadata(); + StoreFilesMetadata storeFilesMetadata = nodeStoreEntry.getValue().storeFilesMetadata(); // we don't have any files at all, it is an empty index if (storeFilesMetadata.isEmpty()) { continue; @@ -441,10 +438,7 @@ private MatchingNodes findMatchingNodes( return new MatchingNodes(matchingNodes, nodeDecisions); } - private static long computeMatchingBytes( - TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore, - TransportNodesListShardStoreMetadata.StoreFilesMetadata storeFilesMetadata - ) { + private static long computeMatchingBytes(StoreFilesMetadata primaryStore, StoreFilesMetadata storeFilesMetadata) { long sizeMatched = 0; for (StoreFileMetadata storeFileMetadata : storeFilesMetadata) { String metadataFileName = storeFileMetadata.name(); @@ -455,19 +449,16 @@ private static long computeMatchingBytes( return sizeMatched; } - private static boolean hasMatchingSyncId( - TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore, - TransportNodesListShardStoreMetadata.StoreFilesMetadata replicaStore - ) { + private static boolean hasMatchingSyncId(StoreFilesMetadata primaryStore, StoreFilesMetadata replicaStore) { String primarySyncId = primaryStore.syncId(); return primarySyncId != null && primarySyncId.equals(replicaStore.syncId()); } private static MatchingNode computeMatchingNode( DiscoveryNode primaryNode, - TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore, + StoreFilesMetadata primaryStore, DiscoveryNode replicaNode, - TransportNodesListShardStoreMetadata.StoreFilesMetadata replicaStore + StoreFilesMetadata replicaStore ) { final long retainingSeqNoForPrimary = primaryStore.getPeerRecoveryRetentionLeaseRetainingSeqNo(primaryNode); final long retainingSeqNoForReplica = primaryStore.getPeerRecoveryRetentionLeaseRetainingSeqNo(replicaNode); @@ -478,7 +469,7 @@ private static MatchingNode computeMatchingNode( } private static boolean canPerformOperationBasedRecovery( - TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore, + StoreFilesMetadata primaryStore, AsyncShardFetch.FetchResult shardStores, DiscoveryNode targetNode ) { diff --git a/server/src/main/java/org/opensearch/indices/IndicesModule.java b/server/src/main/java/org/opensearch/indices/IndicesModule.java index eea5dbbf57f6c..b86e98f4ebcbc 100644 --- a/server/src/main/java/org/opensearch/indices/IndicesModule.java +++ b/server/src/main/java/org/opensearch/indices/IndicesModule.java @@ -81,6 +81,7 @@ import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher; import org.opensearch.indices.store.IndicesStore; import org.opensearch.indices.store.TransportNodesListShardStoreMetadata; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataBatch; import org.opensearch.plugins.MapperPlugin; import java.util.ArrayList; @@ -281,6 +282,7 @@ protected void configure() { bind(IndicesStore.class).asEagerSingleton(); bind(IndicesClusterStateService.class).asEagerSingleton(); bind(TransportNodesListShardStoreMetadata.class).asEagerSingleton(); + bind(TransportNodesListShardStoreMetadataBatch.class).asEagerSingleton(); bind(GlobalCheckpointSyncAction.class).asEagerSingleton(); bind(TransportResyncReplicationAction.class).asEagerSingleton(); bind(PrimaryReplicaSyncer.class).asEagerSingleton(); diff --git a/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadata.java b/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadata.java index 5a3c1038cd5f0..eeee5d8a409aa 100644 --- a/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadata.java +++ b/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadata.java @@ -32,7 +32,6 @@ package org.opensearch.indices.store; -import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.OpenSearchException; import org.opensearch.action.ActionType; import org.opensearch.action.FailedNodeException; @@ -42,40 +41,29 @@ import org.opensearch.action.support.nodes.BaseNodesResponse; import org.opensearch.action.support.nodes.TransportNodesAction; import org.opensearch.cluster.ClusterName; -import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.Nullable; import org.opensearch.common.inject.Inject; import org.opensearch.common.settings.Settings; -import org.opensearch.common.unit.TimeValue; import org.opensearch.core.action.ActionListener; import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; -import org.opensearch.core.common.io.stream.Writeable; import org.opensearch.core.index.shard.ShardId; import org.opensearch.env.NodeEnvironment; import org.opensearch.gateway.AsyncShardFetch; -import org.opensearch.index.IndexService; -import org.opensearch.index.IndexSettings; -import org.opensearch.index.seqno.ReplicationTracker; -import org.opensearch.index.seqno.RetentionLease; -import org.opensearch.index.shard.IndexShard; -import org.opensearch.index.shard.ShardPath; -import org.opensearch.index.store.Store; -import org.opensearch.index.store.StoreFileMetadata; import org.opensearch.indices.IndicesService; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportRequest; import org.opensearch.transport.TransportService; import java.io.IOException; -import java.util.Collections; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.concurrent.TimeUnit; + +import static org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.listShardMetadataInternal; /** * Metadata for shard stores from a list of transport nodes @@ -167,166 +155,7 @@ protected NodeStoreFilesMetadata nodeOperation(NodeRequest request) { private StoreFilesMetadata listStoreMetadata(NodeRequest request) throws IOException { final ShardId shardId = request.getShardId(); - logger.trace("listing store meta data for {}", shardId); - long startTimeNS = System.nanoTime(); - boolean exists = false; - try { - IndexService indexService = indicesService.indexService(shardId.getIndex()); - if (indexService != null) { - IndexShard indexShard = indexService.getShardOrNull(shardId.id()); - if (indexShard != null) { - try { - final StoreFilesMetadata storeFilesMetadata = new StoreFilesMetadata( - shardId, - indexShard.snapshotStoreMetadata(), - indexShard.getPeerRecoveryRetentionLeases() - ); - exists = true; - return storeFilesMetadata; - } catch (org.apache.lucene.index.IndexNotFoundException e) { - logger.trace(new ParameterizedMessage("[{}] node is missing index, responding with empty", shardId), e); - return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()); - } catch (IOException e) { - logger.warn(new ParameterizedMessage("[{}] can't read metadata from store, responding with empty", shardId), e); - return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()); - } - } - } - final String customDataPath; - if (request.getCustomDataPath() != null) { - customDataPath = request.getCustomDataPath(); - } else { - // TODO: Fallback for BWC with older predecessor (ES) versions. - // Remove this once request.getCustomDataPath() always returns non-null - if (indexService != null) { - customDataPath = indexService.getIndexSettings().customDataPath(); - } else { - IndexMetadata metadata = clusterService.state().metadata().index(shardId.getIndex()); - if (metadata != null) { - customDataPath = new IndexSettings(metadata, settings).customDataPath(); - } else { - logger.trace("{} node doesn't have meta data for the requests index", shardId); - throw new OpenSearchException("node doesn't have meta data for index " + shardId.getIndex()); - } - } - } - final ShardPath shardPath = ShardPath.loadShardPath(logger, nodeEnv, shardId, customDataPath); - if (shardPath == null) { - return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()); - } - // note that this may fail if it can't get access to the shard lock. Since we check above there is an active shard, this means: - // 1) a shard is being constructed, which means the cluster-manager will not use a copy of this replica - // 2) A shard is shutting down and has not cleared it's content within lock timeout. In this case the cluster-manager may not - // reuse local resources. - final Store.MetadataSnapshot metadataSnapshot = Store.readMetadataSnapshot( - shardPath.resolveIndex(), - shardId, - nodeEnv::shardLock, - logger - ); - // We use peer recovery retention leases from the primary for allocating replicas. We should always have retention leases when - // we refresh shard info after the primary has started. Hence, we can ignore retention leases if there is no active shard. - return new StoreFilesMetadata(shardId, metadataSnapshot, Collections.emptyList()); - } finally { - TimeValue took = new TimeValue(System.nanoTime() - startTimeNS, TimeUnit.NANOSECONDS); - if (exists) { - logger.debug("{} loaded store meta data (took [{}])", shardId, took); - } else { - logger.trace("{} didn't find any store meta data to load (took [{}])", shardId, took); - } - } - } - - /** - * Metadata for store files - * - * @opensearch.internal - */ - public static class StoreFilesMetadata implements Iterable, Writeable { - private final ShardId shardId; - private final Store.MetadataSnapshot metadataSnapshot; - private final List peerRecoveryRetentionLeases; - - public StoreFilesMetadata( - ShardId shardId, - Store.MetadataSnapshot metadataSnapshot, - List peerRecoveryRetentionLeases - ) { - this.shardId = shardId; - this.metadataSnapshot = metadataSnapshot; - this.peerRecoveryRetentionLeases = peerRecoveryRetentionLeases; - } - - public StoreFilesMetadata(StreamInput in) throws IOException { - this.shardId = new ShardId(in); - this.metadataSnapshot = new Store.MetadataSnapshot(in); - this.peerRecoveryRetentionLeases = in.readList(RetentionLease::new); - } - - @Override - public void writeTo(StreamOutput out) throws IOException { - shardId.writeTo(out); - metadataSnapshot.writeTo(out); - out.writeList(peerRecoveryRetentionLeases); - } - - public ShardId shardId() { - return this.shardId; - } - - public boolean isEmpty() { - return metadataSnapshot.size() == 0; - } - - @Override - public Iterator iterator() { - return metadataSnapshot.iterator(); - } - - public boolean fileExists(String name) { - return metadataSnapshot.asMap().containsKey(name); - } - - public StoreFileMetadata file(String name) { - return metadataSnapshot.asMap().get(name); - } - - /** - * Returns the retaining sequence number of the peer recovery retention lease for a given node if exists; otherwise, returns -1. - */ - public long getPeerRecoveryRetentionLeaseRetainingSeqNo(DiscoveryNode node) { - assert node != null; - final String retentionLeaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId(node.getId()); - return peerRecoveryRetentionLeases.stream() - .filter(lease -> lease.id().equals(retentionLeaseId)) - .mapToLong(RetentionLease::retainingSequenceNumber) - .findFirst() - .orElse(-1L); - } - - public List peerRecoveryRetentionLeases() { - return peerRecoveryRetentionLeases; - } - - /** - * @return commit sync id if exists, else null - */ - public String syncId() { - return metadataSnapshot.getSyncId(); - } - - @Override - public String toString() { - return "StoreFilesMetadata{" - + ", shardId=" - + shardId - + ", metadataSnapshot{size=" - + metadataSnapshot.size() - + ", syncId=" - + metadataSnapshot.getSyncId() - + "}" - + '}'; - } + return listShardMetadataInternal(logger, shardId, nodeEnv, indicesService, request.getCustomDataPath(), settings, clusterService); } /** diff --git a/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadataBatch.java b/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadataBatch.java new file mode 100644 index 0000000000000..3f151fe1c5ca0 --- /dev/null +++ b/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadataBatch.java @@ -0,0 +1,346 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.indices.store; + +import org.opensearch.OpenSearchException; +import org.opensearch.action.ActionType; +import org.opensearch.action.FailedNodeException; +import org.opensearch.action.support.ActionFilters; +import org.opensearch.action.support.nodes.BaseNodeResponse; +import org.opensearch.action.support.nodes.BaseNodesRequest; +import org.opensearch.action.support.nodes.BaseNodesResponse; +import org.opensearch.action.support.nodes.TransportNodesAction; +import org.opensearch.cluster.ClusterName; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.inject.Inject; +import org.opensearch.common.settings.Settings; +import org.opensearch.core.action.ActionListener; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.index.shard.ShardId; +import org.opensearch.env.NodeEnvironment; +import org.opensearch.gateway.AsyncShardFetch; +import org.opensearch.index.store.Store; +import org.opensearch.indices.IndicesService; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.transport.TransportRequest; +import org.opensearch.transport.TransportService; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import static org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.INDEX_NOT_FOUND; + +/** + * Transport action for fetching the batch of shard stores Metadata from a list of transport nodes + * + * @opensearch.internal + */ +public class TransportNodesListShardStoreMetadataBatch extends TransportNodesAction< + TransportNodesListShardStoreMetadataBatch.Request, + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch, + TransportNodesListShardStoreMetadataBatch.NodeRequest, + TransportNodesListShardStoreMetadataBatch.NodeStoreFilesMetadataBatch> + implements + AsyncShardFetch.Lister< + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch, + TransportNodesListShardStoreMetadataBatch.NodeStoreFilesMetadataBatch> { + + public static final String ACTION_NAME = "internal:cluster/nodes/indices/shard/store/batch"; + public static final ActionType TYPE = new ActionType<>( + ACTION_NAME, + TransportNodesListShardStoreMetadataBatch.NodesStoreFilesMetadataBatch::new + ); + + private final Settings settings; + private final IndicesService indicesService; + private final NodeEnvironment nodeEnv; + + @Inject + public TransportNodesListShardStoreMetadataBatch( + Settings settings, + ThreadPool threadPool, + ClusterService clusterService, + TransportService transportService, + IndicesService indicesService, + NodeEnvironment nodeEnv, + ActionFilters actionFilters + ) { + super( + ACTION_NAME, + threadPool, + clusterService, + transportService, + actionFilters, + Request::new, + NodeRequest::new, + ThreadPool.Names.FETCH_SHARD_STORE, + NodeStoreFilesMetadataBatch.class + ); + this.settings = settings; + this.indicesService = indicesService; + this.nodeEnv = nodeEnv; + } + + @Override + public void list( + Map shardAttributes, + DiscoveryNode[] nodes, + ActionListener listener + ) { + execute(new TransportNodesListShardStoreMetadataBatch.Request(shardAttributes, nodes), listener); + } + + @Override + protected NodeRequest newNodeRequest(Request request) { + return new NodeRequest(request); + } + + @Override + protected NodeStoreFilesMetadataBatch newNodeResponse(StreamInput in) throws IOException { + return new NodeStoreFilesMetadataBatch(in); + } + + @Override + protected NodesStoreFilesMetadataBatch newResponse( + Request request, + List responses, + List failures + ) { + return new NodesStoreFilesMetadataBatch(clusterService.getClusterName(), responses, failures); + } + + @Override + protected NodeStoreFilesMetadataBatch nodeOperation(NodeRequest request) { + try { + return new NodeStoreFilesMetadataBatch(clusterService.localNode(), listStoreMetadata(request)); + } catch (IOException e) { + throw new OpenSearchException( + "Failed to list store metadata for shards [" + request.getShardAttributes().keySet().stream().map(ShardId::toString) + "]", + e + ); + } + } + + /** + * This method is similar to listStoreMetadata method of {@link TransportNodesListShardStoreMetadata} + * In this case we fetch the shard store files for batch of shards instead of one shard. + */ + private Map listStoreMetadata(NodeRequest request) throws IOException { + Map shardStoreMetadataMap = new HashMap(); + for (Map.Entry shardAttributes : request.getShardAttributes().entrySet()) { + final ShardId shardId = shardAttributes.getKey(); + try { + StoreFilesMetadata storeFilesMetadata = TransportNodesListShardStoreMetadataHelper.listShardMetadataInternal( + logger, + shardId, + nodeEnv, + indicesService, + shardAttributes.getValue().getCustomDataPath(), + settings, + clusterService + ); + shardStoreMetadataMap.put(shardId, new NodeStoreFilesMetadata(storeFilesMetadata, null)); + } catch (Exception e) { + // should return null in case of known exceptions being returned from listShardMetadataInternal method. + if (e.getMessage().contains(INDEX_NOT_FOUND)) { + shardStoreMetadataMap.put(shardId, null); + } else { + // return actual exception as it is for unknown exceptions + shardStoreMetadataMap.put( + shardId, + new NodeStoreFilesMetadata( + new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()), + e + ) + ); + } + } + } + return shardStoreMetadataMap; + } + + /** + * Request is used in constructing the request for making the transport request to set of other node. + * Refer {@link TransportNodesAction} class start method. + * + * @opensearch.internal + */ + public static class Request extends BaseNodesRequest { + + private final Map shardAttributes; + + public Request(StreamInput in) throws IOException { + super(in); + shardAttributes = in.readMap(ShardId::new, ShardAttributes::new); + } + + public Request(Map shardAttributes, DiscoveryNode[] nodes) { + super(nodes); + this.shardAttributes = Objects.requireNonNull(shardAttributes); + } + + public Map getShardAttributes() { + return shardAttributes; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeMap(shardAttributes, (o, k) -> k.writeTo(o), (o, v) -> v.writeTo(o)); + } + } + + /** + * Metadata for the nodes store files + * + * @opensearch.internal + */ + public static class NodesStoreFilesMetadataBatch extends BaseNodesResponse { + + public NodesStoreFilesMetadataBatch(StreamInput in) throws IOException { + super(in); + } + + public NodesStoreFilesMetadataBatch( + ClusterName clusterName, + List nodes, + List failures + ) { + super(clusterName, nodes, failures); + } + + @Override + protected List readNodesFrom(StreamInput in) throws IOException { + return in.readList(NodeStoreFilesMetadataBatch::new); + } + + @Override + protected void writeNodesTo(StreamOutput out, List nodes) throws IOException { + out.writeList(nodes); + } + } + + /** + * The metadata for the node store files + * + * @opensearch.internal + */ + public static class NodeStoreFilesMetadata { + + private StoreFilesMetadata storeFilesMetadata; + private Exception storeFileFetchException; + + public NodeStoreFilesMetadata(StoreFilesMetadata storeFilesMetadata) { + this.storeFilesMetadata = storeFilesMetadata; + this.storeFileFetchException = null; + } + + public NodeStoreFilesMetadata(StreamInput in) throws IOException { + storeFilesMetadata = new StoreFilesMetadata(in); + if (in.readBoolean()) { + this.storeFileFetchException = in.readException(); + } else { + this.storeFileFetchException = null; + } + } + + public NodeStoreFilesMetadata(StoreFilesMetadata storeFilesMetadata, Exception storeFileFetchException) { + this.storeFilesMetadata = storeFilesMetadata; + this.storeFileFetchException = storeFileFetchException; + } + + public StoreFilesMetadata storeFilesMetadata() { + return storeFilesMetadata; + } + + public void writeTo(StreamOutput out) throws IOException { + storeFilesMetadata.writeTo(out); + if (storeFileFetchException != null) { + out.writeBoolean(true); + out.writeException(storeFileFetchException); + } else { + out.writeBoolean(false); + } + } + + public Exception getStoreFileFetchException() { + return storeFileFetchException; + } + + @Override + public String toString() { + return "[[" + storeFilesMetadata + "]]"; + } + } + + /** + * NodeRequest class is for deserializing the request received by this node from other node for this transport action. + * This is used in {@link TransportNodesAction} + * @opensearch.internal + */ + public static class NodeRequest extends TransportRequest { + + private final Map shardAttributes; + + public NodeRequest(StreamInput in) throws IOException { + super(in); + shardAttributes = in.readMap(ShardId::new, ShardAttributes::new); + } + + public NodeRequest(Request request) { + this.shardAttributes = Objects.requireNonNull(request.getShardAttributes()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeMap(shardAttributes, (o, k) -> k.writeTo(o), (o, v) -> v.writeTo(o)); + } + + public Map getShardAttributes() { + return shardAttributes; + } + } + + /** + * NodeStoreFilesMetadataBatch Response received by the node from other node for this transport action. + * Refer {@link TransportNodesAction} + */ + public static class NodeStoreFilesMetadataBatch extends BaseNodeResponse { + private final Map nodeStoreFilesMetadataBatch; + + protected NodeStoreFilesMetadataBatch(StreamInput in) throws IOException { + super(in); + this.nodeStoreFilesMetadataBatch = in.readMap(ShardId::new, NodeStoreFilesMetadata::new); + } + + public NodeStoreFilesMetadataBatch(DiscoveryNode node, Map nodeStoreFilesMetadataBatch) { + super(node); + this.nodeStoreFilesMetadataBatch = nodeStoreFilesMetadataBatch; + } + + public Map getNodeStoreFilesMetadataBatch() { + return this.nodeStoreFilesMetadataBatch; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeMap(nodeStoreFilesMetadataBatch, (o, k) -> k.writeTo(o), (o, v) -> v.writeTo(o)); + } + } + +} diff --git a/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadataHelper.java b/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadataHelper.java new file mode 100644 index 0000000000000..74b04d6c6d494 --- /dev/null +++ b/server/src/main/java/org/opensearch/indices/store/TransportNodesListShardStoreMetadataHelper.java @@ -0,0 +1,221 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.indices.store; + +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; +import org.opensearch.OpenSearchException; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.common.io.stream.Writeable; +import org.opensearch.core.index.shard.ShardId; +import org.opensearch.env.NodeEnvironment; +import org.opensearch.index.IndexService; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.seqno.ReplicationTracker; +import org.opensearch.index.seqno.RetentionLease; +import org.opensearch.index.shard.IndexShard; +import org.opensearch.index.shard.ShardPath; +import org.opensearch.index.store.Store; +import org.opensearch.index.store.StoreFileMetadata; +import org.opensearch.indices.IndicesService; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * This class has the common code used in {@link TransportNodesListShardStoreMetadata} and + * {@link TransportNodesListShardStoreMetadataBatch} to get the shard info on the local node. + *

+ * This class should not be used to add more functions and will be removed when the + * {@link TransportNodesListShardStoreMetadata} will be deprecated and all the code will be moved to + * {@link TransportNodesListShardStoreMetadataBatch} + * + * @opensearch.internal + */ +public class TransportNodesListShardStoreMetadataHelper { + + public static final String INDEX_NOT_FOUND = "node doesn't have meta data for index "; + + public static StoreFilesMetadata listShardMetadataInternal( + Logger logger, + final ShardId shardId, + NodeEnvironment nodeEnv, + IndicesService indicesService, + String customDataPath, + Settings settings, + ClusterService clusterService + ) throws IOException { + logger.trace("listing store meta data for {}", shardId); + long startTimeNS = System.nanoTime(); + boolean exists = false; + try { + IndexService indexService = indicesService.indexService(shardId.getIndex()); + if (indexService != null) { + IndexShard indexShard = indexService.getShardOrNull(shardId.id()); + if (indexShard != null) { + try { + final StoreFilesMetadata storeFilesMetadata = new StoreFilesMetadata( + shardId, + indexShard.snapshotStoreMetadata(), + indexShard.getPeerRecoveryRetentionLeases() + ); + exists = true; + return storeFilesMetadata; + } catch (org.apache.lucene.index.IndexNotFoundException e) { + logger.trace(new ParameterizedMessage("[{}] node is missing index, responding with empty", shardId), e); + return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()); + } catch (IOException e) { + logger.warn(new ParameterizedMessage("[{}] can't read metadata from store, responding with empty", shardId), e); + return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()); + } + } + } + if (customDataPath == null) { + // TODO: Fallback for BWC with older predecessor (ES) versions. + // Remove this once request.getCustomDataPath() always returns non-null + if (indexService != null) { + customDataPath = indexService.getIndexSettings().customDataPath(); + } else { + IndexMetadata metadata = clusterService.state().metadata().index(shardId.getIndex()); + if (metadata != null) { + customDataPath = new IndexSettings(metadata, settings).customDataPath(); + } else { + logger.trace("{} node doesn't have meta data for the requests index", shardId); + throw new OpenSearchException(INDEX_NOT_FOUND + shardId.getIndex()); + } + } + } + final ShardPath shardPath = ShardPath.loadShardPath(logger, nodeEnv, shardId, customDataPath); + if (shardPath == null) { + return new StoreFilesMetadata(shardId, Store.MetadataSnapshot.EMPTY, Collections.emptyList()); + } + // note that this may fail if it can't get access to the shard lock. Since we check above there is an active shard, this means: + // 1) a shard is being constructed, which means the cluster-manager will not use a copy of this replica + // 2) A shard is shutting down and has not cleared it's content within lock timeout. In this case the cluster-manager may not + // reuse local resources. + final Store.MetadataSnapshot metadataSnapshot = Store.readMetadataSnapshot( + shardPath.resolveIndex(), + shardId, + nodeEnv::shardLock, + logger + ); + // We use peer recovery retention leases from the primary for allocating replicas. We should always have retention leases when + // we refresh shard info after the primary has started. Hence, we can ignore retention leases if there is no active shard. + return new StoreFilesMetadata(shardId, metadataSnapshot, Collections.emptyList()); + } finally { + TimeValue took = new TimeValue(System.nanoTime() - startTimeNS, TimeUnit.NANOSECONDS); + if (exists) { + logger.debug("{} loaded store meta data (took [{}])", shardId, took); + } else { + logger.trace("{} didn't find any store meta data to load (took [{}])", shardId, took); + } + } + } + + /** + * Metadata for store files + * + * @opensearch.internal + */ + public static class StoreFilesMetadata implements Iterable, Writeable { + private final ShardId shardId; + private final Store.MetadataSnapshot metadataSnapshot; + private final List peerRecoveryRetentionLeases; + + public StoreFilesMetadata( + ShardId shardId, + Store.MetadataSnapshot metadataSnapshot, + List peerRecoveryRetentionLeases + ) { + this.shardId = shardId; + this.metadataSnapshot = metadataSnapshot; + this.peerRecoveryRetentionLeases = peerRecoveryRetentionLeases; + } + + public StoreFilesMetadata(StreamInput in) throws IOException { + this.shardId = new ShardId(in); + this.metadataSnapshot = new Store.MetadataSnapshot(in); + this.peerRecoveryRetentionLeases = in.readList(RetentionLease::new); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + shardId.writeTo(out); + metadataSnapshot.writeTo(out); + out.writeList(peerRecoveryRetentionLeases); + } + + public ShardId shardId() { + return this.shardId; + } + + public boolean isEmpty() { + return metadataSnapshot.size() == 0; + } + + @Override + public Iterator iterator() { + return metadataSnapshot.iterator(); + } + + public boolean fileExists(String name) { + return metadataSnapshot.asMap().containsKey(name); + } + + public StoreFileMetadata file(String name) { + return metadataSnapshot.asMap().get(name); + } + + /** + * Returns the retaining sequence number of the peer recovery retention lease for a given node if exists; otherwise, returns -1. + */ + public long getPeerRecoveryRetentionLeaseRetainingSeqNo(DiscoveryNode node) { + assert node != null; + final String retentionLeaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId(node.getId()); + return peerRecoveryRetentionLeases.stream() + .filter(lease -> lease.id().equals(retentionLeaseId)) + .mapToLong(RetentionLease::retainingSequenceNumber) + .findFirst() + .orElse(-1L); + } + + public List peerRecoveryRetentionLeases() { + return peerRecoveryRetentionLeases; + } + + /** + * @return commit sync id if exists, else null + */ + public String syncId() { + return metadataSnapshot.getSyncId(); + } + + @Override + public String toString() { + return "StoreFilesMetadata{" + + ", shardId=" + + shardId + + ", metadataSnapshot{size=" + + metadataSnapshot.size() + + ", syncId=" + + metadataSnapshot.getSyncId() + + "}" + + '}'; + } + } +} diff --git a/server/src/test/java/org/opensearch/gateway/ReplicaShardAllocatorTests.java b/server/src/test/java/org/opensearch/gateway/ReplicaShardAllocatorTests.java index 5833d9c4f187f..ae56bc0f8b3d2 100644 --- a/server/src/test/java/org/opensearch/gateway/ReplicaShardAllocatorTests.java +++ b/server/src/test/java/org/opensearch/gateway/ReplicaShardAllocatorTests.java @@ -68,6 +68,7 @@ import org.opensearch.index.store.Store; import org.opensearch.index.store.StoreFileMetadata; import org.opensearch.indices.store.TransportNodesListShardStoreMetadata; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata; import org.opensearch.snapshots.SnapshotShardSizeInfo; import org.junit.Before; @@ -665,7 +666,7 @@ static String randomSyncId() { class TestAllocator extends ReplicaShardAllocator { - private Map data = null; + private Map data = null; private AtomicBoolean fetchDataCalled = new AtomicBoolean(false); public void clean() { @@ -703,7 +704,7 @@ TestAllocator addData( } data.put( node, - new TransportNodesListShardStoreMetadata.StoreFilesMetadata( + new StoreFilesMetadata( shardId, new Store.MetadataSnapshot(unmodifiableMap(filesAsMap), unmodifiableMap(commitData), randomInt()), peerRecoveryRetentionLeases @@ -721,7 +722,7 @@ protected AsyncShardFetch.FetchResult tData = null; if (data != null) { tData = new HashMap<>(); - for (Map.Entry entry : data.entrySet()) { + for (Map.Entry entry : data.entrySet()) { tData.put( entry.getKey(), new TransportNodesListShardStoreMetadata.NodeStoreFilesMetadata(entry.getKey(), entry.getValue()) diff --git a/server/src/test/java/org/opensearch/index/store/StoreTests.java b/server/src/test/java/org/opensearch/index/store/StoreTests.java index d7d326b325cc6..ab30a4c1c435f 100644 --- a/server/src/test/java/org/opensearch/index/store/StoreTests.java +++ b/server/src/test/java/org/opensearch/index/store/StoreTests.java @@ -87,7 +87,7 @@ import org.opensearch.index.shard.ShardPath; import org.opensearch.index.translog.Translog; import org.opensearch.indices.replication.common.ReplicationType; -import org.opensearch.indices.store.TransportNodesListShardStoreMetadata; +import org.opensearch.indices.store.TransportNodesListShardStoreMetadataHelper.StoreFilesMetadata; import org.opensearch.test.DummyShardLock; import org.opensearch.test.FeatureFlagSetter; import org.opensearch.test.IndexSettingsModule; @@ -980,12 +980,11 @@ public void testStreamStoreFilesMetadata() throws Exception { ) ); } - TransportNodesListShardStoreMetadata.StoreFilesMetadata outStoreFileMetadata = - new TransportNodesListShardStoreMetadata.StoreFilesMetadata( - new ShardId("test", "_na_", 0), - metadataSnapshot, - peerRecoveryRetentionLeases - ); + StoreFilesMetadata outStoreFileMetadata = new StoreFilesMetadata( + new ShardId("test", "_na_", 0), + metadataSnapshot, + peerRecoveryRetentionLeases + ); ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer); org.opensearch.Version targetNodeVersion = randomVersion(random()); @@ -994,8 +993,7 @@ public void testStreamStoreFilesMetadata() throws Exception { ByteArrayInputStream inBuffer = new ByteArrayInputStream(outBuffer.toByteArray()); InputStreamStreamInput in = new InputStreamStreamInput(inBuffer); in.setVersion(targetNodeVersion); - TransportNodesListShardStoreMetadata.StoreFilesMetadata inStoreFileMetadata = - new TransportNodesListShardStoreMetadata.StoreFilesMetadata(in); + StoreFilesMetadata inStoreFileMetadata = new StoreFilesMetadata(in); Iterator outFiles = outStoreFileMetadata.iterator(); for (StoreFileMetadata inFile : inStoreFileMetadata) { assertThat(inFile.name(), equalTo(outFiles.next().name()));