Skip to content

Commit

Permalink
Checks for problems in Accumulo (apache#4957)
Browse files Browse the repository at this point in the history
* Checks for problems in Accumulo

This partially completes apache#4892:

- Moves existing checks (`checkTablets` and the fate check for dangling locks) into the appropriate new `admin check` command
- Adds new checks
- New tests in AdminCheckIT
- Created new check TABLE_LOCKS which checks for
	- valid locked table/namespace ids (the locked table/namespaces exist)
	- locked table/namespaces are associated with a fate op
- ROOT_METADATA now checks for
	- offline tablets
	- missing "columns"
	- invalid "columns"
- ROOT_TABLE now checks for
	- offline tablets
	- tablets for metadata table have no holes, valid (null) prev end row for first tablet, and valid (null) end row for last tablet
	- missing columns
	- invalid columns
- METADATA_TABLE now checks for
	- offline tablets
	- tablets for user tables (and scanref) have no holes, valid (null) prev end row for first tablet, and valid (null) end row for last tablet
	- missing columns
	- invalid columns
- SYSTEM_FILES now checks for
	- missing system files
- USER_FILES now checks for
	- missing user files
  • Loading branch information
kevinrr888 authored Dec 10, 2024
1 parent 96c4582 commit c0979fd
Show file tree
Hide file tree
Showing 18 changed files with 941 additions and 188 deletions.
12 changes: 0 additions & 12 deletions core/src/main/java/org/apache/accumulo/core/fate/AdminUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -424,18 +424,6 @@ public void print(ReadOnlyTStore<T> zs, ZooReader zk, ServiceLock.ServiceLockPat
txStatus.getWaitingLocks(), txStatus.getTop(), txStatus.getTimeCreatedFormatted());
}
fmt.format(" %s transactions", fateStatus.getTransactions().size());

if (!fateStatus.getDanglingHeldLocks().isEmpty()
|| !fateStatus.getDanglingWaitingLocks().isEmpty()) {
fmt.format("%nThe following locks did not have an associated FATE operation%n");
for (Entry<String,List<String>> entry : fateStatus.getDanglingHeldLocks().entrySet()) {
fmt.format("txid: %s locked: %s%n", entry.getKey(), entry.getValue());
}

for (Entry<String,List<String>> entry : fateStatus.getDanglingWaitingLocks().entrySet()) {
fmt.format("txid: %s locking: %s%n", entry.getKey(), entry.getValue());
}
}
}

public boolean prepDelete(TStore<T> zs, ZooReaderWriter zk, ServiceLockPath path,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ private static byte[] readField(DataInputStream din) throws IOException {
return b;
}

// decode a bunch of key value pairs that have been encoded into a single value
/**
* decode a bunch of key value pairs that have been encoded into a single value
*/
public static final SortedMap<Key,Value> decodeRow(Key rowKey, Value rowValue)
throws IOException {
SortedMap<Key,Value> map = new TreeMap<>();
Expand All @@ -110,8 +112,10 @@ private static void encode(DataOutputStream dout, ByteSequence bs) throws IOExce
dout.write(bs.getBackingArray(), bs.offset(), bs.length());
}

// take a stream of keys and values and output a value that encodes everything but their row
// keys and values must be paired one for one
/**
* take a stream of keys and values and output a value that encodes everything but their row keys
* and values must be paired one for one
*/
public static final Value encodeRow(List<Key> keys, List<Value> values) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
DataOutputStream dout = new DataOutputStream(out);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
import org.apache.accumulo.server.util.checkCommand.RootTableCheckRunner;
import org.apache.accumulo.server.util.checkCommand.SystemConfigCheckRunner;
import org.apache.accumulo.server.util.checkCommand.SystemFilesCheckRunner;
import org.apache.accumulo.server.util.checkCommand.TableLocksCheckRunner;
import org.apache.accumulo.server.util.checkCommand.UserFilesCheckRunner;
import org.apache.accumulo.server.util.fateCommand.FateSummaryReport;
import org.apache.accumulo.start.spi.KeywordExecutable;
Expand Down Expand Up @@ -145,6 +146,10 @@ public static class CheckCommand {
@Parameter(description = "[<Check>...]")
List<String> checks;

@Parameter(names = "--fixFiles", description = "Removes dangling file pointers. Used by the "
+ "USER_FILES and SYSTEM_FILES checks.")
boolean fixFiles = false;

/**
* This should be used to get the check runner instead of {@link Check#getCheckRunner()}. This
* exists so that its functionality can be changed for testing.
Expand All @@ -159,6 +164,9 @@ public enum Check {
// Caution should be taken when changing or adding any new checks: order is important
SYSTEM_CONFIG(SystemConfigCheckRunner::new, "Validate the system config stored in ZooKeeper",
Collections.emptyList()),
TABLE_LOCKS(TableLocksCheckRunner::new,
"Ensures that table and namespace locks are valid and are associated with a FATE op",
Collections.singletonList(SYSTEM_CONFIG)),
ROOT_METADATA(RootMetadataCheckRunner::new,
"Checks integrity of the root tablet metadata stored in ZooKeeper",
Collections.singletonList(SYSTEM_CONFIG)),
Expand Down Expand Up @@ -214,16 +222,6 @@ public enum CheckStatus {
}
}

@Parameters(commandDescription = "print tablets that are offline in online tables")
static class CheckTabletsCommand {
@Parameter(names = "--fixFiles", description = "Remove dangling file pointers")
boolean fixFiles = false;

@Parameter(names = {"-t", "--table"},
description = "Table to check, if not set checks all tables")
String tableName = null;
}

@Parameters(commandDescription = "stop the manager")
static class StopManagerCommand {}

Expand Down Expand Up @@ -377,9 +375,6 @@ public void execute(final String[] args) {
CheckCommand checkCommand = new CheckCommand();
cl.addCommand("check", checkCommand);

CheckTabletsCommand checkTabletsCommand = new CheckTabletsCommand();
cl.addCommand("checkTablets", checkTabletsCommand);

DeleteZooInstanceCommand deleteZooInstOpts = new DeleteZooInstanceCommand();
cl.addCommand("deleteZooInstance", deleteZooInstOpts);

Expand Down Expand Up @@ -441,24 +436,6 @@ public void execute(final String[] args) {
if (ping(context, pingCommand.args) != 0) {
rc = 4;
}
} else if (cl.getParsedCommand().equals("checkTablets")) {
System.out.println("\n*** Looking for offline tablets ***\n");
if (FindOfflineTablets.findOffline(context, checkTabletsCommand.tableName) != 0) {
rc = 5;
}
System.out.println("\n*** Looking for missing files ***\n");
if (checkTabletsCommand.tableName == null) {
if (RemoveEntriesForMissingFiles.checkAllTables(context, checkTabletsCommand.fixFiles)
!= 0) {
rc = 6;
}
} else {
if (RemoveEntriesForMissingFiles.checkTable(context, checkTabletsCommand.tableName,
checkTabletsCommand.fixFiles) != 0) {
rc = 6;
}
}

} else if (cl.getParsedCommand().equals("stop")) {
stopTabletServer(context, stopOpts.args, opts.force);
} else if (cl.getParsedCommand().equals("dumpConfig")) {
Expand All @@ -482,7 +459,7 @@ public void execute(final String[] args) {
} else if (cl.getParsedCommand().equals("serviceStatus")) {
printServiceStatus(context, serviceStatusCommandOpts);
} else if (cl.getParsedCommand().equals("check")) {
executeCheckCommand(context, checkCommand);
executeCheckCommand(context, checkCommand, opts);
} else {
everything = cl.getParsedCommand().equals("stopAll");

Expand Down Expand Up @@ -1012,15 +989,16 @@ private EnumSet<ReadOnlyTStore.TStatus> getCmdLineStatusFilters(List<String> sta
}

@VisibleForTesting
public static void executeCheckCommand(ServerContext context, CheckCommand cmd) {
public static void executeCheckCommand(ServerContext context, CheckCommand cmd,
ServerUtilOpts opts) throws Exception {
validateAndTransformCheckCommand(cmd);

if (cmd.list) {
listChecks();
} else if (cmd.run) {
var givenChecks =
cmd.checks.stream().map(CheckCommand.Check::valueOf).collect(Collectors.toList());
executeRunCheckCommand(cmd, givenChecks);
var givenChecks = cmd.checks.stream()
.map(name -> CheckCommand.Check.valueOf(name.toUpperCase())).collect(Collectors.toList());
executeRunCheckCommand(cmd, givenChecks, context, opts);
}
}

Expand Down Expand Up @@ -1051,19 +1029,19 @@ private static void validateAndTransformCheckCommand(CheckCommand cmd) {

private static void listChecks() {
System.out.println();
System.out.printf("%-20s | %-80s | %-20s%n", "Check Name", "Description", "Depends on");
System.out.println("-".repeat(120));
System.out.printf("%-20s | %-90s | %-20s%n", "Check Name", "Description", "Depends on");
System.out.println("-".repeat(130));
for (CheckCommand.Check check : CheckCommand.Check.values()) {
System.out.printf("%-20s | %-80s | %-20s%n", check.name(), check.getDescription(),
System.out.printf("%-20s | %-90s | %-20s%n", check.name(), check.getDescription(),
check.getDependencies().stream().map(CheckCommand.Check::name)
.collect(Collectors.joining(", ")));
}
System.out.println("-".repeat(120));
System.out.println("-".repeat(130));
System.out.println();
}

private static void executeRunCheckCommand(CheckCommand cmd,
List<CheckCommand.Check> givenChecks) {
private static void executeRunCheckCommand(CheckCommand cmd, List<CheckCommand.Check> givenChecks,
ServerContext context, ServerUtilOpts opts) throws Exception {
// Get all the checks in the order they are declared in the enum
final var allChecks = CheckCommand.Check.values();
final TreeMap<CheckCommand.Check,CheckCommand.CheckStatus> checkStatus = new TreeMap<>();
Expand All @@ -1073,7 +1051,7 @@ private static void executeRunCheckCommand(CheckCommand cmd,
checkStatus.put(check, CheckCommand.CheckStatus.SKIPPED_DEPENDENCY_FAILED);
} else {
if (givenChecks.contains(check)) {
checkStatus.put(check, cmd.getCheckRunner(check).runCheck());
checkStatus.put(check, cmd.getCheckRunner(check).runCheck(context, opts, cmd.fixFiles));
} else {
checkStatus.put(check, CheckCommand.CheckStatus.FILTERED_OUT);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeSet;
import java.util.function.Consumer;

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;
Expand All @@ -45,75 +46,83 @@
import io.opentelemetry.context.Scope;

public class CheckForMetadataProblems {
private static boolean sawProblems = false;
private static ServerUtilOpts opts;

private static void checkTable(TableId tableId, TreeSet<KeyExtent> tablets) {
private static boolean checkTable(TableId tableId, TreeSet<KeyExtent> tablets,
ServerUtilOpts opts, Consumer<String> printInfoMethod, Consumer<String> printProblemMethod) {
// sanity check of metadata table entries
// make sure tablets has no holes, and that it starts and ends w/ null
// make sure tablets have no holes, and that it starts and ends w/ null
String tableName;
boolean sawProblems = false;

try {
tableName = opts.getServerContext().getTableName(tableId);
} catch (TableNotFoundException e) {
tableName = null;
}

printInfoMethod.accept(String.format("Ensuring tablets for table %s (%s) have: no holes, "
+ "valid (null) prev end row for first tablet, and valid (null) end row "
+ "for last tablet...\n", tableName, tableId));

if (tablets.isEmpty()) {
System.out.println(
"...No entries found in metadata table for table " + tableName + " (" + tableId + ")");
sawProblems = true;
return;
printProblemMethod.accept(String
.format("...No entries found in metadata table for table %s (%s)", tableName, tableId));
return true;
}

if (tablets.first().prevEndRow() != null) {
System.out.println("...First entry for table " + tableName + " (" + tableId + ") - "
+ tablets.first() + " - has non null prev end row");
sawProblems = true;
return;
printProblemMethod
.accept(String.format("...First entry for table %s (%s) - %s - has non-null prev end row",
tableName, tableId, tablets.first()));
return true;
}

if (tablets.last().endRow() != null) {
System.out.println("...Last entry for table " + tableName + " (" + tableId + ") - "
+ tablets.last() + " - has non null end row");
sawProblems = true;
return;
printProblemMethod
.accept(String.format("...Last entry for table %s (%s) - %s - has non-null end row",
tableName, tableId, tablets.last()));
return true;
}

Iterator<KeyExtent> tabIter = tablets.iterator();
Text lastEndRow = tabIter.next().endRow();
boolean everythingLooksGood = true;
while (tabIter.hasNext()) {
KeyExtent tabke = tabIter.next();
KeyExtent table = tabIter.next();
boolean broke = false;
if (tabke.prevEndRow() == null) {
System.out.println("...Table " + tableName + " (" + tableId
+ ") has null prev end row in middle of table " + tabke);
if (table.prevEndRow() == null) {
printProblemMethod
.accept(String.format("...Table %s (%s) has null prev end row in middle of table %s",
tableName, tableId, table));
broke = true;
} else if (!tabke.prevEndRow().equals(lastEndRow)) {
System.out.println("...Table " + tableName + " (" + tableId + ") has a hole "
+ tabke.prevEndRow() + " != " + lastEndRow);
} else if (!table.prevEndRow().equals(lastEndRow)) {
printProblemMethod.accept(String.format("...Table %s (%s) has a hole %s != %s", tableName,
tableId, table.prevEndRow(), lastEndRow));
broke = true;
}
if (broke) {
everythingLooksGood = false;
}

lastEndRow = tabke.endRow();
lastEndRow = table.endRow();
}
if (everythingLooksGood) {
System.out.println("...All is well for table " + tableName + " (" + tableId + ")");
printInfoMethod.accept(String.format("...All is well for table %s (%s)", tableName, tableId));
} else {
sawProblems = true;
}

return sawProblems;
}

private static void checkMetadataAndRootTableEntries(String tableNameToCheck, ServerUtilOpts opts)
public static boolean checkMetadataAndRootTableEntries(String tableNameToCheck,
ServerUtilOpts opts, Consumer<String> printInfoMethod, Consumer<String> printProblemMethod)
throws Exception {
TableId tableCheckId = opts.getServerContext().getTableId(tableNameToCheck);
System.out.println("Checking tables whose metadata is found in: " + tableNameToCheck + " ("
+ tableCheckId + ")");
printInfoMethod.accept(String.format("Checking tables whose metadata is found in: %s (%s)...\n",
tableNameToCheck, tableCheckId));
Map<TableId,TreeSet<KeyExtent>> tables = new HashMap<>();
boolean sawProblems = false;

try (AccumuloClient client = Accumulo.newClient().from(opts.getClientProps()).build();
Scanner scanner = client.createScanner(tableNameToCheck, Authorizations.EMPTY)) {
Expand All @@ -139,7 +148,10 @@ private static void checkMetadataAndRootTableEntries(String tableNameToCheck, Se
TreeSet<KeyExtent> tablets = tables.get(tableId);
if (tablets == null) {

tables.forEach(CheckForMetadataProblems::checkTable);
for (var e : tables.entrySet()) {
sawProblems = CheckForMetadataProblems.checkTable(e.getKey(), e.getValue(), opts,
printInfoMethod, printProblemMethod) || sawProblems;
}

tables.clear();

Expand All @@ -153,37 +165,45 @@ private static void checkMetadataAndRootTableEntries(String tableNameToCheck, Se
justLoc = false;
} else if (colf.equals(CurrentLocationColumnFamily.NAME)) {
if (justLoc) {
System.out.println("Problem at key " + entry.getKey());
printProblemMethod.accept("Problem at key " + entry.getKey());
sawProblems = true;
}
justLoc = true;
}
}

if (count == 0) {
System.err
.println("ERROR : table " + tableNameToCheck + " (" + tableCheckId + ") is empty");
printProblemMethod.accept(
String.format("ERROR : table %s (%s) is empty", tableNameToCheck, tableCheckId));
sawProblems = true;
}
}

tables.forEach(CheckForMetadataProblems::checkTable);
for (var e : tables.entrySet()) {
sawProblems = CheckForMetadataProblems.checkTable(e.getKey(), e.getValue(), opts,
printInfoMethod, printProblemMethod) || sawProblems;
}

if (!sawProblems) {
System.out.println("No problems found in " + tableNameToCheck + " (" + tableCheckId + ")");
printInfoMethod.accept(
String.format("\n...No problems found in %s (%s)", tableNameToCheck, tableCheckId));
}
// end METADATA table sanity check
return sawProblems;
}

public static void main(String[] args) throws Exception {
opts = new ServerUtilOpts();
ServerUtilOpts opts = new ServerUtilOpts();
opts.parseArgs(CheckForMetadataProblems.class.getName(), args);
Span span = TraceUtil.startSpan(CheckForMetadataProblems.class, "main");
boolean sawProblems;
try (Scope scope = span.makeCurrent()) {

checkMetadataAndRootTableEntries(AccumuloTable.ROOT.tableName(), opts);
sawProblems = checkMetadataAndRootTableEntries(AccumuloTable.ROOT.tableName(), opts,
System.out::println, System.out::println);
System.out.println();
checkMetadataAndRootTableEntries(AccumuloTable.METADATA.tableName(), opts);
sawProblems = checkMetadataAndRootTableEntries(AccumuloTable.METADATA.tableName(), opts,
System.out::println, System.out::println) || sawProblems;
if (sawProblems) {
throw new IllegalStateException();
}
Expand Down
Loading

0 comments on commit c0979fd

Please sign in to comment.