From 9e23b57e8f233d139d0bc61385377323d6e5831d Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Mon, 11 Dec 2023 14:35:58 -0500 Subject: [PATCH 01/19] Ingest broad metrics using apache csv --- THIRD-PARTY-LICENSES.txt | 18 +- metricsaggregator/pom.xml | 14 + .../client/cli/CommandLineArgs.java | 43 ++- .../client/cli/MetricsAggregatorClient.java | 54 ++- .../client/cli/TerraMetricsSubmitter.java | 326 ++++++++++++++++++ .../src/main/resources/logback.xml | 2 +- .../client/cli/TerraMetricsSubmitterTest.java | 40 +++ pom.xml | 11 +- topicgenerator/pom.xml | 1 - utils/pom.xml | 5 + .../utils/DockstoreApiClientUtils.java | 16 + 11 files changed, 496 insertions(+), 34 deletions(-) create mode 100644 metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java create mode 100644 metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java create mode 100644 utils/src/main/java/io/dockstore/utils/DockstoreApiClientUtils.java diff --git a/THIRD-PARTY-LICENSES.txt b/THIRD-PARTY-LICENSES.txt index eefa08d2..32349b1a 100644 --- a/THIRD-PARTY-LICENSES.txt +++ b/THIRD-PARTY-LICENSES.txt @@ -131,10 +131,10 @@ Lists of 417 third-party dependencies. (The Apache Software License, Version 2.0) docker-java-core (com.github.docker-java:docker-java-core:3.3.0 - https://github.com/docker-java/docker-java) (The Apache Software License, Version 2.0) docker-java-transport (com.github.docker-java:docker-java-transport:3.3.0 - https://github.com/docker-java/docker-java) (The Apache Software License, Version 2.0) docker-java-transport-httpclient5 (com.github.docker-java:docker-java-transport-httpclient5:3.3.0 - https://github.com/docker-java/docker-java) - (Apache Software License, Version 2.0) dockstore-common (io.dockstore:dockstore-common:1.15.0-alpha.13 - no url defined) - (Apache Software License, Version 2.0) dockstore-integration-testing (io.dockstore:dockstore-integration-testing:1.15.0-alpha.13 - no url defined) - (Apache Software License, Version 2.0) dockstore-language-plugin-parent (io.dockstore:dockstore-language-plugin-parent:1.15.0-alpha.13 - no url defined) - (Apache Software License, Version 2.0) dockstore-webservice (io.dockstore:dockstore-webservice:1.15.0-alpha.13 - no url defined) + (Apache Software License, Version 2.0) dockstore-common (io.dockstore:dockstore-common:1.15.0-SNAPSHOT - no url defined) + (Apache Software License, Version 2.0) dockstore-integration-testing (io.dockstore:dockstore-integration-testing:1.15.0-SNAPSHOT - no url defined) + (Apache Software License, Version 2.0) dockstore-language-plugin-parent (io.dockstore:dockstore-language-plugin-parent:1.15.0-SNAPSHOT - no url defined) + (Apache Software License, Version 2.0) dockstore-webservice (io.dockstore:dockstore-webservice:1.15.0-SNAPSHOT - no url defined) (Apache License 2.0) Dropwizard (io.dropwizard:dropwizard-core:4.0.2 - http://www.dropwizard.io/4.0.2/dropwizard-bom/dropwizard-dependencies/dropwizard-parent/dropwizard-core) (Apache License 2.0) Dropwizard Asset Bundle (io.dropwizard:dropwizard-assets:4.0.2 - http://www.dropwizard.io/4.0.2/dropwizard-bom/dropwizard-dependencies/dropwizard-parent/dropwizard-assets) (Apache License 2.0) Dropwizard Authentication (io.dropwizard:dropwizard-auth:4.0.2 - http://www.dropwizard.io/4.0.2/dropwizard-bom/dropwizard-dependencies/dropwizard-parent/dropwizard-auth) @@ -309,9 +309,9 @@ Lists of 417 third-party dependencies. 
(MIT License) liquibase-slf4j (com.mattbertolini:liquibase-slf4j:5.0.0 - https://github.com/mattbertolini/liquibase-slf4j) (Apache License 2.0) localstack-utils (cloud.localstack:localstack-utils:0.2.22 - http://localstack.cloud) (Apache Software Licenses) Log4j Implemented Over SLF4J (org.slf4j:log4j-over-slf4j:2.0.9 - http://www.slf4j.org) - (Eclipse Public License - v 1.0) (GNU Lesser General Public License) Logback Access Module (ch.qos.logback:logback-access:1.4.11 - http://logback.qos.ch/logback-access) - (Eclipse Public License - v 1.0) (GNU Lesser General Public License) Logback Classic Module (ch.qos.logback:logback-classic:1.4.11 - http://logback.qos.ch/logback-classic) - (Eclipse Public License - v 1.0) (GNU Lesser General Public License) Logback Core Module (ch.qos.logback:logback-core:1.4.11 - http://logback.qos.ch/logback-core) + (Eclipse Public License - v 1.0) (GNU Lesser General Public License) Logback Access Module (ch.qos.logback:logback-access:1.4.12 - http://logback.qos.ch/logback-access) + (Eclipse Public License - v 1.0) (GNU Lesser General Public License) Logback Classic Module (ch.qos.logback:logback-classic:1.4.12 - http://logback.qos.ch/logback-classic) + (Eclipse Public License - v 1.0) (GNU Lesser General Public License) Logback Core Module (ch.qos.logback:logback-core:1.4.12 - http://logback.qos.ch/logback-core) (Apache License, Version 2.0) (MIT License) Logstash Logback Encoder (net.logstash.logback:logstash-logback-encoder:4.11 - https://github.com/logstash/logstash-logback-encoder) (Apache License, Version 2.0) Lucene Core (org.apache.lucene:lucene-core:8.7.0 - https://lucene.apache.org/lucene-parent/lucene-core) (MIT) mbknor-jackson-jsonSchema (com.kjetland:mbknor-jackson-jsonschema_2.12:1.0.34 - https://github.com/mbknor/mbknor-jackson-jsonSchema) @@ -354,7 +354,7 @@ Lists of 417 third-party dependencies. (Apache License, Version 2.0) Objenesis (org.objenesis:objenesis:3.2 - http://objenesis.org/objenesis) (The Apache Software License, Version 2.0) okhttp (com.squareup.okhttp3:okhttp:4.10.0 - https://square.github.io/okhttp/) (The Apache Software License, Version 2.0) okio (com.squareup.okio:okio-jvm:3.0.0 - https://github.com/square/okio/) - (Apache Software License, Version 2.0) openapi-java-client (io.dockstore:openapi-java-client:1.15.0-alpha.13 - no url defined) + (Apache Software License, Version 2.0) openapi-java-client (io.dockstore:openapi-java-client:1.15.0-SNAPSHOT - no url defined) (The Apache License, Version 2.0) OpenCensus (io.opencensus:opencensus-api:0.31.0 - https://github.com/census-instrumentation/opencensus-java) (Apache 2) opencsv (com.opencsv:opencsv:5.7.1 - http://opencsv.sf.net) (Apache 2.0) optics (io.circe:circe-optics_2.13:0.14.1 - https://github.com/circe/circe-optics) @@ -395,7 +395,7 @@ Lists of 417 third-party dependencies. 
(Apache License 2.0) swagger-core-jakarta (io.swagger.core.v3:swagger-core-jakarta:2.2.15 - https://github.com/swagger-api/swagger-core/modules/swagger-core-jakarta) (Apache License 2.0) swagger-integration-jakarta (io.swagger.core.v3:swagger-integration-jakarta:2.2.15 - https://github.com/swagger-api/swagger-core/modules/swagger-integration-jakarta) (Apache Software License, Version 2.0) swagger-java-bitbucket-client (io.dockstore:swagger-java-bitbucket-client:2.0.3 - no url defined) - (Apache Software License, Version 2.0) swagger-java-client (io.dockstore:swagger-java-client:1.15.0-alpha.13 - no url defined) + (Apache Software License, Version 2.0) swagger-java-client (io.dockstore:swagger-java-client:1.15.0-SNAPSHOT - no url defined) (Apache Software License, Version 2.0) swagger-java-discourse-client (io.dockstore:swagger-java-discourse-client:2.0.1 - no url defined) (Apache Software License, Version 2.0) swagger-java-quay-client (io.dockstore:swagger-java-quay-client:2.0.2 - no url defined) (Apache Software License, Version 2.0) swagger-java-sam-client (io.dockstore:swagger-java-sam-client:2.0.2 - no url defined) diff --git a/metricsaggregator/pom.xml b/metricsaggregator/pom.xml index 7a525b01..362f0e83 100644 --- a/metricsaggregator/pom.xml +++ b/metricsaggregator/pom.xml @@ -132,6 +132,14 @@ org.slf4j slf4j-api + + ch.qos.logback + logback-core + + + ch.qos.logback + logback-classic + software.amazon.awssdk s3 @@ -144,6 +152,10 @@ javax.money money-api + + org.apache.commons + commons-csv + org.junit.jupiter junit-jupiter-api @@ -278,6 +290,8 @@ org.glassfish.jersey.inject:jersey-hk2 javax.money:money-api org.javamoney.moneta:moneta-core + ch.qos.logback:logback-classic + ch.qos.logback:logback-core diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java index f84c0bd7..9b78c444 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java @@ -42,7 +42,7 @@ public File getConfig() { } @Parameters(commandNames = { "submit-validation-data" }, commandDescription = "Formats workflow validation data specified in a file then submits it to Dockstore") - public static class SubmitValidationData extends CommandLineArgs { + public static class SubmitValidationData extends CommandLineArgs { @Parameter(names = {"-c", "--config"}, description = "The config file path.") private File config = new File("./" + MetricsAggregatorClient.CONFIG_FILE_NAME); @@ -78,4 +78,45 @@ public Partner getPlatform() { return platform; } } + + @Parameters(commandNames = { "submit-terra-metrics" }, commandDescription = "Formats workflow validation data specified in a file then submits it to Dockstore") + public static class SubmitTerraMetrics extends CommandLineArgs { + @Parameter(names = {"-c", "--config"}, description = "The config file path.") + private File config = new File("./" + MetricsAggregatorClient.CONFIG_FILE_NAME); + + + @Parameter(names = {"-d", "--data"}, description = "The file path to the CSV file containing workflow metrics from Terra. 
The first line of the file should contain the CSV fields: workflow_id,status,workflow_start,workflow_end,workflow_runtime_minutes,source_url", required = true) + private String dataFilePath; + + @Parameter(names = {"-r", "--recordSkipped"}, description = "Record skipped executions and the reason skipped to a CSV file") + private boolean recordSkippedExecutions; + + public File getConfig() { + return config; + } + + + public String getDataFilePath() { + return dataFilePath; + } + + public boolean isRecordSkippedExecutions() { + return recordSkippedExecutions; + } + + /** + * Headers for the input data file + */ + public enum TerraMetricsCsvHeaders { + workflow_id, status, workflow_start, workflow_end, workflow_runtime_minutes, source_url + } + + /** + * Headers for the output file containing workflow executions that were skipped. + * The headers are the same as the input file headers, with the addition of a "reason" header indicating why an execution was skipped + */ + public enum SkippedTerraMetricsCsvHeaders { + workflow_id, status, workflow_start, workflow_end, workflow_runtime_minutes, source_url, reason_skipped + } + } } diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java index 49036119..6c7e4c06 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java @@ -19,6 +19,7 @@ import static io.dockstore.utils.CLIConstants.FAILURE_EXIT_CODE; import static io.dockstore.utils.ConfigFileUtils.getConfiguration; +import static io.dockstore.utils.DockstoreApiClientUtils.setupApiClient; import com.beust.jcommander.JCommander; import com.beust.jcommander.MissingCommandException; @@ -27,9 +28,9 @@ import io.dockstore.metricsaggregator.MetricsAggregatorConfig; import io.dockstore.metricsaggregator.MetricsAggregatorS3Client; import io.dockstore.metricsaggregator.client.cli.CommandLineArgs.AggregateMetricsCommand; +import io.dockstore.metricsaggregator.client.cli.CommandLineArgs.SubmitTerraMetrics; import io.dockstore.metricsaggregator.client.cli.CommandLineArgs.SubmitValidationData; import io.dockstore.openapi.client.ApiClient; -import io.dockstore.openapi.client.Configuration; import io.dockstore.openapi.client.api.ExtendedGa4GhApi; import io.dockstore.openapi.client.model.ExecutionsRequestBody; import io.dockstore.openapi.client.model.ValidationExecution; @@ -38,6 +39,8 @@ import java.io.IOException; import java.net.URISyntaxException; import java.nio.file.Files; +import java.time.Duration; +import java.time.Instant; import java.util.List; import java.util.Optional; import org.apache.commons.configuration2.INIConfiguration; @@ -56,19 +59,22 @@ public class MetricsAggregatorClient { private static final Logger LOG = LoggerFactory.getLogger(MetricsAggregatorClient.class); - public MetricsAggregatorClient() { } public static void main(String[] args) { + final Instant startTime = Instant.now(); MetricsAggregatorClient metricsAggregatorClient = new MetricsAggregatorClient(); final CommandLineArgs commandLineArgs = new CommandLineArgs(); final JCommander jCommander = new JCommander(commandLineArgs); final AggregateMetricsCommand aggregateMetricsCommand = new AggregateMetricsCommand(); final SubmitValidationData submitValidationData = new SubmitValidationData(); + final 
SubmitTerraMetrics submitTerraMetrics = new SubmitTerraMetrics(); + jCommander.addCommand(aggregateMetricsCommand); jCommander.addCommand(submitValidationData); + jCommander.addCommand(submitTerraMetrics); try { jCommander.parse(args); @@ -117,20 +123,35 @@ public static void main(String[] args) { try { final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config.get()); metricsAggregatorClient.submitValidationData(metricsAggregatorConfig, submitValidationData.getValidator(), - submitValidationData.getValidatorVersion(), submitValidationData.getDataFilePath(), submitValidationData.getPlatform()); + submitValidationData.getValidatorVersion(), submitValidationData.getDataFilePath(), + submitValidationData.getPlatform()); } catch (Exception e) { LOG.error("Could not submit validation metrics to Dockstore", e); System.exit(FAILURE_EXIT_CODE); } } - } - } + } else if ("submit-terra-metrics".equals(jCommander.getParsedCommand())) { + if (submitTerraMetrics.isHelp()) { + jCommander.usage(); + } else { + final Optional config = getConfiguration(submitTerraMetrics.getConfig()); + if (config.isEmpty()) { + System.exit(FAILURE_EXIT_CODE); + } - private ApiClient setupApiClient(String serverUrl, String token) { - ApiClient apiClient = Configuration.getDefaultApiClient(); - apiClient.setBasePath(serverUrl); - apiClient.addDefaultHeader("Authorization", "Bearer " + token); - return apiClient; + try { + final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config.get()); + final TerraMetricsSubmitter submitTerraMetricsCommand = new TerraMetricsSubmitter(metricsAggregatorConfig, + submitTerraMetrics); + submitTerraMetricsCommand.submitMetrics(); + } catch (Exception e) { + LOG.error("Could not submit terra metrics to Dockstore", e); + System.exit(FAILURE_EXIT_CODE); + } + } + } + final Instant endTime = Instant.now(); + LOG.info("{} took {}", jCommander.getParsedCommand(), Duration.between(startTime, endTime)); } private void aggregateMetrics(MetricsAggregatorConfig config) throws URISyntaxException { @@ -147,8 +168,8 @@ private void aggregateMetrics(MetricsAggregatorConfig config) throws URISyntaxEx metricsAggregatorS3Client.aggregateMetrics(extendedGa4GhApi); } - - private void submitValidationData(MetricsAggregatorConfig config, ValidatorToolEnum validator, String validatorVersion, String dataFilePath, Partner platform) throws IOException { + private void submitValidationData(MetricsAggregatorConfig config, ValidatorToolEnum validator, String validatorVersion, + String dataFilePath, Partner platform) throws IOException { ApiClient apiClient = setupApiClient(config.getDockstoreServerUrl(), config.getDockstoreToken()); ExtendedGa4GhApi extendedGa4GhApi = new ExtendedGa4GhApi(apiClient); @@ -179,17 +200,16 @@ private void submitValidationData(MetricsAggregatorConfig config, ValidatorToolE continue; } String dateExecuted = lineComponents[DATE_EXECUTED_INDEX]; - ValidationExecution validationExecution = new ValidationExecution() - .validatorTool(validator) - .validatorToolVersion(validatorVersion) - .isValid(isValid); + ValidationExecution validationExecution = new ValidationExecution().validatorTool(validator) + .validatorToolVersion(validatorVersion).isValid(isValid); validationExecution.setDateExecuted(dateExecuted); ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().validationExecutions(List.of(validationExecution)); try { extendedGa4GhApi.executionMetricsPost(executionsRequestBody, platform.toString(), trsId, 
versionName, "Validation executions submitted using dockstore-support metricsaggregator"); - System.out.printf("Submitted validation metrics for tool ID %s, version %s, %s validated by %s %s on platform %s%n", trsId, versionName, isValid ? "successfully" : "unsuccessfully", validator, validatorVersion, platform); + System.out.printf("Submitted validation metrics for tool ID %s, version %s, %s validated by %s %s on platform %s%n", trsId, + versionName, isValid ? "successfully" : "unsuccessfully", validator, validatorVersion, platform); } catch (Exception e) { // Could end up here if the workflow no longer exists. Log then continue processing LOG.error("Could not submit validation executions to Dockstore for workflow {}", csvLine, e); diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java new file mode 100644 index 00000000..53012496 --- /dev/null +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -0,0 +1,326 @@ +package io.dockstore.metricsaggregator.client.cli; + +import static io.dockstore.utils.CLIConstants.FAILURE_EXIT_CODE; +import static io.dockstore.utils.DockstoreApiClientUtils.setupApiClient; + +import io.dockstore.common.Partner; +import io.dockstore.metricsaggregator.MetricsAggregatorConfig; +import io.dockstore.metricsaggregator.client.cli.CommandLineArgs.SubmitTerraMetrics; +import io.dockstore.metricsaggregator.client.cli.CommandLineArgs.SubmitTerraMetrics.SkippedTerraMetricsCsvHeaders; +import io.dockstore.metricsaggregator.client.cli.CommandLineArgs.SubmitTerraMetrics.TerraMetricsCsvHeaders; +import io.dockstore.openapi.client.ApiClient; +import io.dockstore.openapi.client.ApiException; +import io.dockstore.openapi.client.api.ExtendedGa4GhApi; +import io.dockstore.openapi.client.api.WorkflowsApi; +import io.dockstore.openapi.client.model.ExecutionsRequestBody; +import io.dockstore.openapi.client.model.RunExecution; +import io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum; +import io.dockstore.openapi.client.model.Workflow.DescriptorTypeEnum; +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.ChronoField; +import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; +import org.apache.commons.csv.CSVRecord; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TerraMetricsSubmitter { + private static final Logger LOG = LoggerFactory.getLogger(MetricsAggregatorClient.class); + private final MetricsAggregatorConfig config; + private final SubmitTerraMetrics submitTerraMetricsCommand; + private int numberOfExecutionsSubmitted = 0; + private int numberOfExecutionsSkipped = 0; + private int numberOfCacheHits = 0; + private int numberOfCacheMisses = 0; + // Keep track 
of sourceUrls that are found to be ambiguous + private Map sourceUrlsToSkipToReason = new HashMap<>(); + // Map of source url to TRS info that was previously calculated + private Map sourceUrlToSourceUrlTrsInfo = new HashMap<>(); + private Map> workflowPathPrefixToWorkflows = new HashMap<>(); + + + public TerraMetricsSubmitter(MetricsAggregatorConfig config, SubmitTerraMetrics submitTerraMetricsCommand) { + this.config = config; + this.submitTerraMetricsCommand = submitTerraMetricsCommand; + } + + public void submitMetrics() { + ApiClient apiClient = setupApiClient(config.getDockstoreServerUrl(), config.getDockstoreToken()); + ExtendedGa4GhApi extendedGa4GhApi = new ExtendedGa4GhApi(apiClient); + WorkflowsApi workflowsApi = new WorkflowsApi(apiClient); + + // Read CSV file + Iterable workflowMetricRecords = null; + final String inputDateFilePath = this.submitTerraMetricsCommand.getDataFilePath(); + try (BufferedReader metricsBufferedReader = new BufferedReader(new FileReader(inputDateFilePath))) { + //final Reader entriesCsv = new BufferedReader(new FileReader(inputDateFilePath)); + final CSVFormat csvFormat = CSVFormat.DEFAULT.builder() + .setHeader(TerraMetricsCsvHeaders.class) + .setSkipHeaderRecord(true) + .setTrim(true) + .build(); + workflowMetricRecords = csvFormat.parse(metricsBufferedReader); + + final String outputFileName = inputDateFilePath + "_skipped_executions_" + Instant.now().truncatedTo(ChronoUnit.SECONDS).toString().replace("-", "").replace(":", "") + ".csv"; + + try (CSVPrinter skippedExecutionsCsvPrinter = submitTerraMetricsCommand.isRecordSkippedExecutions() ? new CSVPrinter(new FileWriter(outputFileName, StandardCharsets.UTF_8), CSVFormat.DEFAULT.builder().setHeader(SkippedTerraMetricsCsvHeaders.class).build()) : null) { + for (CSVRecord workflowMetricRecord: workflowMetricRecords) { + final String sourceUrl = workflowMetricRecord.get(TerraMetricsCsvHeaders.source_url); + LOG.info("Processing execution on row {} with source_url {}", workflowMetricRecord.getRecordNumber(), sourceUrl); + + // Check to see if this source_url was skipped before + if (sourceUrlsToSkipToReason.containsKey(sourceUrl)) { + skipExecution(sourceUrl, workflowMetricRecord, sourceUrlsToSkipToReason.get(sourceUrl), skippedExecutionsCsvPrinter); + continue; + } + + // Check to see if we need to figure out the TRS ID for the source URL + if (sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { + ++numberOfCacheHits; + } else { + ++numberOfCacheMisses; + Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, sourceUrl, workflowsApi, skippedExecutionsCsvPrinter); + if (sourceUrlTrsInfo.isEmpty()) { + continue; + } else { + sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); + } + } + + final Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); + if (workflowExecution.isEmpty()) { + continue; + } + + final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl); + final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get())); + try { + extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), + "Terra metrics from BigQuery table broad-dsde-prod-analytics-dev.externally_shared_metrics.dockstore_workflow_metrics_Q4_2023"); + ++numberOfExecutionsSubmitted; + } catch (ApiException e) { + skipExecution(sourceUrl, 
workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter);
+                    }
+                }
+            } catch (IOException e) {
+                LOG.error("Unable to create new CSV output file", e);
+                System.exit(FAILURE_EXIT_CODE);
+            }
+
+            LOG.info("Done submitting executions from Terra. Submitted {} executions. Skipped {} executions. Cache hits: {}. Cache misses: {}", numberOfExecutionsSubmitted, numberOfExecutionsSkipped, numberOfCacheHits, numberOfCacheMisses);
+            if (submitTerraMetricsCommand.isRecordSkippedExecutions()) {
+                LOG.info("View skipped executions in file {}", outputFileName);
+            }
+        } catch (IOException e) {
+            LOG.error("Unable to read input CSV file", e);
+            System.exit(FAILURE_EXIT_CODE);
+        }
+    }
+
+    static Optional<ExecutionStatusEnum> getExecutionStatusEnumFromTerraStatus(String terraStatus) {
+        ExecutionStatusEnum executionStatusEnum = switch (terraStatus) {
+        case "Succeeded" -> ExecutionStatusEnum.SUCCESSFUL;
+        case "Failed" -> ExecutionStatusEnum.FAILED;
+        case "Aborted" -> ExecutionStatusEnum.ABORTED;
+        default -> null;
+        };
+        return Optional.ofNullable(executionStatusEnum);
+    }
+
+    static Optional<String> formatStringInIso8601Date(String workflowStart) {
+        // Example workflow_start: 2022-07-15 15:37:06.450000 UTC
+        final int maxNumberOfMicroSeconds = 9;
+        final DateTimeFormatter workflowStartFormat = new DateTimeFormatterBuilder()
+                .appendPattern("yyyy-MM-dd HH:mm:ss")
+                .appendFraction(ChronoField.MICRO_OF_SECOND, 0, maxNumberOfMicroSeconds, true) // There are varying widths of micro seconds
+                .optionalStart() // Start optional time zone pattern. Time zone is not always included
+                .appendPattern(" ")
+                .appendZoneId()
+                .optionalEnd() // End optional time zone pattern
+                .toFormatter();
+
+        try {
+            final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, workflowStartFormat);
+            return Optional.of(DateTimeFormatter.ISO_INSTANT.format(localDateTime.atOffset(ZoneOffset.UTC)));
+        } catch (DateTimeParseException e) {
+            return Optional.empty();
+        }
+    }
+
+    private Optional<String> getPrimaryDescriptorAbsolutePath(WorkflowsApi workflowsApi, MinimalWorkflowInfo workflow, String version) {
+        try {
+            return Optional.of(workflowsApi.primaryDescriptor1(workflow.id(), version, workflow.descriptorType().toString()).getAbsolutePath());
+        } catch (ApiException e) {
+            return Optional.empty();
+        }
+    }
+
+
+    private List<String> getSourceUrlComponents(String sourceUrl) {
+        final String rawGitHubUserContentUrlPrefix = "https://raw.githubusercontent.com/"; // The source_url starts with this
+        final String sourceUrlWithoutGitHubPrefix = sourceUrl.replace(rawGitHubUserContentUrlPrefix, "");
+        return Arrays.stream(sourceUrlWithoutGitHubPrefix.split("/"))
+                .filter(urlComponent -> !urlComponent.isEmpty()) // Filter out empty strings that are a result of consecutive slashes '//'
+                .toList();
+    }
+
+    private void skipExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter) {
+        LOG.info("Skipping execution on row {}: {}", csvRecordToSkip.getRecordNumber(), reason);
+        // Record to map for future reference
+        sourceUrlsToSkipToReason.put(sourceUrl, reason);
+        if (submitTerraMetricsCommand.isRecordSkippedExecutions()) {
+            // Record to output CSV file for later examination
+            // Headers: workflow_id, status, workflow_start, workflow_end, workflow_runtime_minutes, source_url
+            List<String> csvColumnValues = Arrays.stream(TerraMetricsCsvHeaders.values())
+                    .map(csvRecordToSkip::get) // Get the column value for the record
+
.collect(Collectors.toList());
+            csvColumnValues.add(reason);
+            try {
+                skippedExecutionsCsvPrinter.printRecord(csvColumnValues);
+            } catch (IOException e) {
+                LOG.error("Could not write skipped execution to output file");
+            }
+        }
+        ++numberOfExecutionsSkipped;
+    }
+
+    public Optional<RunExecution> getTerraWorkflowExecutionFromCsvRecord(CSVRecord csvRecord, String sourceUrl, CSVPrinter skippedExecutionsCsvPrinter) {
+        final String executionId = csvRecord.get(TerraMetricsCsvHeaders.workflow_id);
+        final String workflowStart = csvRecord.get(TerraMetricsCsvHeaders.workflow_start);
+        final String status = csvRecord.get(TerraMetricsCsvHeaders.status);
+        final String workflowRunTimeMinutes = csvRecord.get(TerraMetricsCsvHeaders.workflow_runtime_minutes);
+
+        // Check that all required fields are non-null
+        if (executionId == null || workflowStart == null || status == null) {
+            skipExecution(sourceUrl, csvRecord, "One or more of the required fields (workflow_id, workflow_start, status) is missing", skippedExecutionsCsvPrinter);
+            return Optional.empty();
+        }
+
+        // Format fields into Dockstore schema
+        final Optional<ExecutionStatusEnum> executionStatus = getExecutionStatusEnumFromTerraStatus(status);
+        if (executionStatus.isEmpty()) {
+            skipExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter);
+            return Optional.empty();
+        }
+
+        final Optional<String> dateExecuted = formatStringInIso8601Date(workflowStart);
+        if (dateExecuted.isEmpty()) {
+            skipExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter);
+            return Optional.empty();
+        }
+
+        RunExecution workflowExecution = new RunExecution();
+        // TODO: uncomment below when the update executions endpoint PR is merged
+        // workflowExecution.setExecutionId(executionId);
+        workflowExecution.setExecutionStatus(executionStatus.get());
+        workflowExecution.setDateExecuted(dateExecuted.get());
+        getExecutionTime(workflowRunTimeMinutes).ifPresent(workflowExecution::setExecutionTime);
+        return Optional.of(workflowExecution);
+    }
+
+    static Optional<String> getExecutionTime(String workflowRunTimeMinutes) {
+        if (StringUtils.isBlank(workflowRunTimeMinutes)) {
+            return Optional.empty();
+        }
+        return Optional.of(String.format("PT%sM", workflowRunTimeMinutes));
+    }
+
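[Editor's note: illustrative aside, not part of the original patch. getExecutionTime() renders Terra's workflow_runtime_minutes column as an ISO 8601 duration, so getExecutionTime("100") yields Optional.of("PT100M"), and Duration.parse("PT100M").toMinutes() == 100. The minutes value is interpolated verbatim, so a non-numeric column value would produce a malformed duration string; presumably the Dockstore API rejects such a value at submission time.]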
+    private Optional<SourceUrlTrsInfo> calculateTrsInfoFromSourceUrl(CSVRecord workflowMetricRecord, String sourceUrl, WorkflowsApi workflowsApi, CSVPrinter skippedExecutionsCsvPrinter) {
+        // Need to figure out the TRS ID and version name using the source_url.
+        // Example source_url: https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl
+        // Organization = "theiagen/public_health_viral_genomics", version = "v2.0.0", the rest is the primary descriptor path
+        // Note that the TRS ID may also have a workflow name, which we need to figure out
+        final List<String> sourceUrlComponents = getSourceUrlComponents(sourceUrl);
+
+        // There should be at least three components in order for there to be an organization, a repository, and a version,
+        // as in <organization>/<repository>/<version>/<path-to-primary-descriptor>
+        final int minNumberOfComponents = 3;
+        if (sourceUrlComponents.size() >= minNumberOfComponents) {
+            final String organization = sourceUrlComponents.get(0) + "/" + sourceUrlComponents.get(1);
+            final String version = sourceUrlComponents.get(2);
+            final String primaryDescriptorPathFromUrl = "/" + String.join("/", sourceUrlComponents.subList(3, sourceUrlComponents.size()));
+
+            final String workflowPathPrefix = "github.com/" + organization;
+            if (!workflowPathPrefixToWorkflows.containsKey(workflowPathPrefix)) {
+                try {
+                    List<MinimalWorkflowInfo> publishedWorkflowsWithSamePathPrefix = workflowsApi.getAllPublishedWorkflowByPath(workflowPathPrefix).stream()
+                            .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new HashMap<>()))
+                            .toList();
+                    workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix);
+                } catch (ApiException e) {
+                    skipExecution(sourceUrl, workflowMetricRecord,
+                            "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID",
+                            skippedExecutionsCsvPrinter);
+                    return Optional.empty();
+                }
+            }
+
+            List<MinimalWorkflowInfo> workflowsFromSameRepo = workflowPathPrefixToWorkflows.get(workflowPathPrefix);
+
+            List<String> foundFullWorkflowPaths = new ArrayList<>();
+            workflowsFromSameRepo.forEach(workflow -> {
+                if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) {
+                    // Intentionally putting null in map to indicate that the primary descriptor path for the workflow version doesn't exist
+                    workflow.versionToPrimaryDescriptorPath().put(version, getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).orElse(null));
+                }
+
+                final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version);
+                if (primaryDescriptorPathFromUrl.equals(primaryDescriptorPathForVersion)) {
+                    // Collect a list of workflow paths that have the primary descriptor path we're looking for
+                    foundFullWorkflowPaths.add(workflow.fullWorkflowPath());
+                }
+            });
+
+
+            if (foundFullWorkflowPaths.isEmpty()) {
+                skipExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter);
+                return Optional.empty();
+            } else if (foundFullWorkflowPaths.size() > 1) {
+                // There is more than one workflow in the same repository with the same primary descriptor path.
+                // Skip this source url because it is an ambiguous case and we can't identify which workflow the source url is referring to.
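[Editor's note: hypothetical example, not part of the original patch. If two workflows registered from one repository, say github.com/example-org/example-repo/wfA and github.com/example-org/example-repo/wfB, both resolve /workflows/main.wdl as their primary descriptor at version v1.0.0, then a source_url ending in .../v1.0.0/workflows/main.wdl matches both TRS entries and cannot be attributed to either, so the execution is skipped below.]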
+ skipExecution(sourceUrl, workflowMetricRecord, "There's more than one workflow in the repository with the same primary descriptor path", skippedExecutionsCsvPrinter); + return Optional.empty(); + } else { + final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); + return Optional.of(sourceUrlTrsInfo); + } + } else { + skipExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter); + return Optional.empty(); + } + } + + /** + * Record that stores the calculated TRS ID and version, derived from the source_url + * @param sourceUrl + * @param trsId + * @param version + */ + public record SourceUrlTrsInfo(String sourceUrl, String trsId, String version) { + } + + public record MinimalWorkflowInfo(long id, String fullWorkflowPath, DescriptorTypeEnum descriptorType, Map versionToPrimaryDescriptorPath) { + } + + public record WorkflowsFromSameOrganizationInfo(String workflowPathPrefix, String organization, Map primaryDescriptorPathToFullWorkflowPath) { + } +} diff --git a/metricsaggregator/src/main/resources/logback.xml b/metricsaggregator/src/main/resources/logback.xml index ef430a56..3f2d6d36 100644 --- a/metricsaggregator/src/main/resources/logback.xml +++ b/metricsaggregator/src/main/resources/logback.xml @@ -23,7 +23,7 @@ - + diff --git a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java new file mode 100644 index 00000000..8b038362 --- /dev/null +++ b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java @@ -0,0 +1,40 @@ +package io.dockstore.metricsaggregator.client.cli; + +import static io.dockstore.metricsaggregator.client.cli.TerraMetricsSubmitter.formatStringInIso8601Date; +import static io.dockstore.metricsaggregator.client.cli.TerraMetricsSubmitter.getExecutionTime; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum; +import org.junit.jupiter.api.Test; + +class TerraMetricsSubmitterTest { + + @Test + void testFormatStringInIso8601Date() { + assertEquals("2022-07-15T15:37:06.123456Z", formatStringInIso8601Date("2022-07-15 15:37:06.123456 UTC").get()); + assertEquals("2022-07-15T15:37:06.450Z", formatStringInIso8601Date("2022-07-15 15:37:06.450000 UTC").get()); + assertEquals("2022-06-30T00:33:44.101Z", formatStringInIso8601Date("2022-06-30 00:33:44.101000 UTC").get()); + assertEquals("2022-09-06T13:46:53Z", formatStringInIso8601Date("2022-09-06 13:46:53").get()); + assertEquals("2022-05-25T13:13:08.510Z", formatStringInIso8601Date("2022-05-25 13:13:08.51 UTC").get()); + assertEquals("2022-12-01T01:52:05.700Z", formatStringInIso8601Date("2022-12-01 01:52:05.7 UTC").get()); + } + + @Test + void testGetExecutionStatusEnumFromTerraStatus() { + assertEquals(ExecutionStatusEnum.SUCCESSFUL, TerraMetricsSubmitter.getExecutionStatusEnumFromTerraStatus("Succeeded").get()); + assertEquals(ExecutionStatusEnum.FAILED, TerraMetricsSubmitter.getExecutionStatusEnumFromTerraStatus("Failed").get()); + assertEquals(ExecutionStatusEnum.ABORTED, TerraMetricsSubmitter.getExecutionStatusEnumFromTerraStatus("Aborted").get()); + // These two statuses are present in the workflow executions 
Terra provided, but they don't represent a completed execution + assertTrue(TerraMetricsSubmitter.getExecutionStatusEnumFromTerraStatus("Aborting").isEmpty()); + assertTrue(TerraMetricsSubmitter.getExecutionStatusEnumFromTerraStatus("Running").isEmpty()); + } + + @Test + void testGetExecutionTime() { + assertEquals("PT100M", getExecutionTime("100").get()); + assertTrue(getExecutionTime("").isEmpty()); + assertTrue(getExecutionTime(" ").isEmpty()); + assertTrue(getExecutionTime(null).isEmpty()); + } +} diff --git a/pom.xml b/pom.xml index 4538516f..5ada9db7 100644 --- a/pom.xml +++ b/pom.xml @@ -38,7 +38,7 @@ scm:git:git@github.com:dockstore/dockstore-support.git UTF-8 - 1.15.0-alpha.13 + 1.15.0-SNAPSHOT 3.0.0-M5 2.22.2 false @@ -162,6 +162,11 @@ jersey-hk2 3.0.11 + + org.apache.commons + commons-csv + 1.10.0 + @@ -239,10 +244,6 @@ true - - - **/logback.xml - diff --git a/topicgenerator/pom.xml b/topicgenerator/pom.xml index 64ee2aa7..95ab3576 100644 --- a/topicgenerator/pom.xml +++ b/topicgenerator/pom.xml @@ -129,7 +129,6 @@ org.apache.commons commons-csv - 1.10.0 diff --git a/utils/pom.xml b/utils/pom.xml index 9157a180..e6439bb5 100644 --- a/utils/pom.xml +++ b/utils/pom.xml @@ -89,6 +89,11 @@ commons-beanutils commons-beanutils + + io.dockstore + openapi-java-client + ${dockstore-core.version} + diff --git a/utils/src/main/java/io/dockstore/utils/DockstoreApiClientUtils.java b/utils/src/main/java/io/dockstore/utils/DockstoreApiClientUtils.java new file mode 100644 index 00000000..b65d1e99 --- /dev/null +++ b/utils/src/main/java/io/dockstore/utils/DockstoreApiClientUtils.java @@ -0,0 +1,16 @@ +package io.dockstore.utils; + +import io.dockstore.openapi.client.ApiClient; +import io.dockstore.openapi.client.Configuration; + +public final class DockstoreApiClientUtils { + private DockstoreApiClientUtils() { + } + + public static ApiClient setupApiClient(String serverUrl, String token) { + ApiClient apiClient = Configuration.getDefaultApiClient(); + apiClient.setBasePath(serverUrl); + apiClient.addDefaultHeader("Authorization", "Bearer " + token); + return apiClient; + } +} From 8bf546da75325622e1548eae59db9927f38a7a99 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Tue, 12 Dec 2023 11:29:34 -0500 Subject: [PATCH 02/19] Use concurrent types --- .../client/cli/TerraMetricsSubmitter.java | 115 ++++++++++-------- 1 file changed, 66 insertions(+), 49 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index 53012496..aa2d468c 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -35,6 +35,8 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; @@ -47,16 +49,16 @@ public class TerraMetricsSubmitter { private static final Logger LOG = LoggerFactory.getLogger(MetricsAggregatorClient.class); private final MetricsAggregatorConfig config; private final SubmitTerraMetrics submitTerraMetricsCommand; - private int numberOfExecutionsSubmitted = 0; - private int numberOfExecutionsSkipped = 0; - private int numberOfCacheHits = 0; - 
private int numberOfCacheMisses = 0; + private final AtomicInteger numberOfExecutionsProcessed = new AtomicInteger(0); + private final AtomicInteger numberOfExecutionsSubmitted = new AtomicInteger(0); + private final AtomicInteger numberOfExecutionsSkipped = new AtomicInteger(0); + private final AtomicInteger numberOfCacheHits = new AtomicInteger(0); + private final AtomicInteger numberOfCacheMisses = new AtomicInteger(0); // Keep track of sourceUrls that are found to be ambiguous - private Map sourceUrlsToSkipToReason = new HashMap<>(); + private final Map sourceUrlsToSkipToReason = new ConcurrentHashMap<>(); // Map of source url to TRS info that was previously calculated - private Map sourceUrlToSourceUrlTrsInfo = new HashMap<>(); - private Map> workflowPathPrefixToWorkflows = new HashMap<>(); - + private final Map sourceUrlToSourceUrlTrsInfo = new ConcurrentHashMap<>(); + private final Map> workflowPathPrefixToWorkflows = new ConcurrentHashMap<>(); public TerraMetricsSubmitter(MetricsAggregatorConfig config, SubmitTerraMetrics submitTerraMetricsCommand) { this.config = config; @@ -80,45 +82,21 @@ public void submitMetrics() { .build(); workflowMetricRecords = csvFormat.parse(metricsBufferedReader); - final String outputFileName = inputDateFilePath + "_skipped_executions_" + Instant.now().truncatedTo(ChronoUnit.SECONDS).toString().replace("-", "").replace(":", "") + ".csv"; + final String outputFileName = inputDateFilePath + "_skipped_executions_" + Instant.now().truncatedTo(ChronoUnit.SECONDS).toString().replace("-", "").replace(":", "") + ".csv"; + List workflowMetricsToProcess = new ArrayList<>(); try (CSVPrinter skippedExecutionsCsvPrinter = submitTerraMetricsCommand.isRecordSkippedExecutions() ? new CSVPrinter(new FileWriter(outputFileName, StandardCharsets.UTF_8), CSVFormat.DEFAULT.builder().setHeader(SkippedTerraMetricsCsvHeaders.class).build()) : null) { for (CSVRecord workflowMetricRecord: workflowMetricRecords) { - final String sourceUrl = workflowMetricRecord.get(TerraMetricsCsvHeaders.source_url); - LOG.info("Processing execution on row {} with source_url {}", workflowMetricRecord.getRecordNumber(), sourceUrl); - - // Check to see if this source_url was skipped before - if (sourceUrlsToSkipToReason.containsKey(sourceUrl)) { - skipExecution(sourceUrl, workflowMetricRecord, sourceUrlsToSkipToReason.get(sourceUrl), skippedExecutionsCsvPrinter); - continue; - } - - // Check to see if we need to figure out the TRS ID for the source URL - if (sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { - ++numberOfCacheHits; + final int batchSize = 100000; + if (workflowMetricsToProcess.size() < batchSize && workflowMetricRecords.iterator().hasNext()) { + workflowMetricsToProcess.add(workflowMetricRecord); } else { - ++numberOfCacheMisses; - Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, sourceUrl, workflowsApi, skippedExecutionsCsvPrinter); - if (sourceUrlTrsInfo.isEmpty()) { - continue; - } else { - sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); - } - } - - final Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); - if (workflowExecution.isEmpty()) { - continue; - } - - final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl); - final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get())); - try { - extendedGa4GhApi.executionMetricsPost(executionsRequestBody, 
Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), - "Terra metrics from BigQuery table broad-dsde-prod-analytics-dev.externally_shared_metrics.dockstore_workflow_metrics_Q4_2023"); - ++numberOfExecutionsSubmitted; - } catch (ApiException e) { - skipExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter); + workflowMetricsToProcess.add(workflowMetricRecord); + LOG.info("Processing {} rows", workflowMetricsToProcess.size()); + workflowMetricsToProcess.stream() + .parallel() + .forEach(record -> processWorkflowExecution(record, workflowsApi, extendedGa4GhApi, skippedExecutionsCsvPrinter)); + workflowMetricsToProcess.clear(); } } } catch (IOException e) { @@ -126,7 +104,7 @@ public void submitMetrics() { System.exit(FAILURE_EXIT_CODE); } - LOG.info("Done submitting executions from Terra. Submitted {} executions. Skipped {} executions. Cache hits: {}. Cache misses: {}", numberOfExecutionsSubmitted, numberOfExecutionsSkipped, numberOfCacheHits, numberOfCacheMisses); + LOG.info("Done processing {} executions from Terra. Submitted {} executions. Skipped {} executions. Cache hits: {}. Cache misses: {}", numberOfExecutionsProcessed, numberOfExecutionsSubmitted, numberOfExecutionsSkipped, numberOfCacheHits, numberOfCacheMisses); if (submitTerraMetricsCommand.isRecordSkippedExecutions()) { LOG.info("View skipped executions in file {}", outputFileName); } @@ -136,6 +114,46 @@ public void submitMetrics() { } } + private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsApi workflowsApi, ExtendedGa4GhApi extendedGa4GhApi, CSVPrinter skippedExecutionsCsvPrinter) { + final String sourceUrl = workflowMetricRecord.get(TerraMetricsCsvHeaders.source_url); + LOG.info("Processing execution on row {} with source_url {}", workflowMetricRecord.getRecordNumber(), sourceUrl); + numberOfExecutionsProcessed.incrementAndGet(); + + // Check to see if this source_url was skipped before + if (sourceUrlsToSkipToReason.containsKey(sourceUrl)) { + skipExecution(sourceUrl, workflowMetricRecord, sourceUrlsToSkipToReason.get(sourceUrl), skippedExecutionsCsvPrinter); + return; + } + + // Check to see if we need to figure out the TRS ID for the source URL + if (sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { + numberOfCacheHits.incrementAndGet(); + } else { + numberOfCacheMisses.incrementAndGet(); + Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, sourceUrl, workflowsApi, skippedExecutionsCsvPrinter); + if (sourceUrlTrsInfo.isEmpty()) { + return; + } else { + sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); + } + } + + final Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); + if (workflowExecution.isEmpty()) { + return; + } + + final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl); + final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get())); + try { + extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), + "Terra metrics from BigQuery table broad-dsde-prod-analytics-dev.externally_shared_metrics.dockstore_workflow_metrics_Q4_2023"); + numberOfExecutionsSubmitted.incrementAndGet(); + } catch (ApiException e) { + 
skipExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter); + } + } + static Optional getExecutionStatusEnumFromTerraStatus(String terraStatus) { ExecutionStatusEnum executionStatusEnum = switch (terraStatus) { case "Succeeded" -> ExecutionStatusEnum.SUCCESSFUL; @@ -200,7 +218,7 @@ private void skipExecution(String sourceUrl, CSVRecord csvRecordToSkip, String r LOG.error("Could not write skipped execution to output file"); } } - ++numberOfExecutionsSkipped; + numberOfExecutionsSkipped.incrementAndGet(); } public Optional getTerraWorkflowExecutionFromCsvRecord(CSVRecord csvRecord, String sourceUrl, CSVPrinter skippedExecutionsCsvPrinter) { @@ -277,6 +295,7 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf List workflowsFromSameRepo = workflowPathPrefixToWorkflows.get(workflowPathPrefix); List foundFullWorkflowPaths = new ArrayList<>(); + // Loop through each workflow to find one that matches the primary descriptor workflowsFromSameRepo.forEach(workflow -> { if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { // Intentionally putting null in map to indicate that the primary descriptor path for the workflow version doesn't exist @@ -297,12 +316,13 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf } else if (foundFullWorkflowPaths.size() > 1) { // There is already a workflow in the same repository with the same descriptor path that we're looking for. // Skip this source url because it is an ambiguous case and we can't identify which workflow the source url is referring to. - skipExecution(sourceUrl, workflowMetricRecord, "There's more than one workflow in the repository with the same primary descriptor path", skippedExecutionsCsvPrinter); + skipExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter); return Optional.empty(); } else { final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); return Optional.of(sourceUrlTrsInfo); } + } else { skipExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter); return Optional.empty(); @@ -320,7 +340,4 @@ public record SourceUrlTrsInfo(String sourceUrl, String trsId, String version) { public record MinimalWorkflowInfo(long id, String fullWorkflowPath, DescriptorTypeEnum descriptorType, Map versionToPrimaryDescriptorPath) { } - - public record WorkflowsFromSameOrganizationInfo(String workflowPathPrefix, String organization, Map primaryDescriptorPathToFullWorkflowPath) { - } } From 16c575d2802b66d85f5ad8dce88b1d5c9ac3e45c Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Tue, 12 Dec 2023 16:10:38 -0500 Subject: [PATCH 03/19] Clean up --- .../metricsaggregator/client/cli/TerraMetricsSubmitter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index aa2d468c..2057cc10 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ 
b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -147,7 +147,7 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get())); try { extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), - "Terra metrics from BigQuery table broad-dsde-prod-analytics-dev.externally_shared_metrics.dockstore_workflow_metrics_Q4_2023"); + "Terra metrics from Q4 2023"); numberOfExecutionsSubmitted.incrementAndGet(); } catch (ApiException e) { skipExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter); From c0277ac200d074290b22788bd3c8b7669bef3f30 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 14 Dec 2023 13:11:38 -0500 Subject: [PATCH 04/19] Fix bug where executions were skipped when they shouldn't be skipped --- .../client/cli/TerraMetricsSubmitter.java | 133 +++++++++++------- 1 file changed, 85 insertions(+), 48 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index 2057cc10..7490ba4b 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -21,6 +21,7 @@ import java.io.FileWriter; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.time.DateTimeException; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; @@ -31,7 +32,6 @@ import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -47,13 +47,22 @@ public class TerraMetricsSubmitter { private static final Logger LOG = LoggerFactory.getLogger(MetricsAggregatorClient.class); + private static final int MAX_NUMBER_OF_MICRO_SECONDS = 9; + // Example workflow_start: 2022-07-15 15:37:06.450000 UTC + private static final DateTimeFormatter WORKFLOW_START_FORMAT = new DateTimeFormatterBuilder() + .appendPattern("yyyy-MM-dd HH:mm:ss") + .appendFraction(ChronoField.MICRO_OF_SECOND, 0, MAX_NUMBER_OF_MICRO_SECONDS, true) // There are varying widths of micro seconds + .optionalStart() // Start optional time zone pattern. 
Time zone is not always included
+            .appendPattern(" ")
+            .appendZoneId()
+            .optionalEnd() // End optional time zone pattern
+            .toFormatter();
     private final MetricsAggregatorConfig config;
     private final SubmitTerraMetrics submitTerraMetricsCommand;
     private final AtomicInteger numberOfExecutionsProcessed = new AtomicInteger(0);
     private final AtomicInteger numberOfExecutionsSubmitted = new AtomicInteger(0);
     private final AtomicInteger numberOfExecutionsSkipped = new AtomicInteger(0);
-    private final AtomicInteger numberOfCacheHits = new AtomicInteger(0);
-    private final AtomicInteger numberOfCacheMisses = new AtomicInteger(0);
+
     // Keep track of sourceUrls that are found to be ambiguous
     private final Map<String, String> sourceUrlsToSkipToReason = new ConcurrentHashMap<>();
     // Map of source url to TRS info that was previously calculated
     private final Map<String, SourceUrlTrsInfo> sourceUrlToSourceUrlTrsInfo = new ConcurrentHashMap<>();
     private final Map<String, List<MinimalWorkflowInfo>> workflowPathPrefixToWorkflows = new ConcurrentHashMap<>();
@@ -74,7 +83,6 @@ public void submitMetrics() {
         Iterable<CSVRecord> workflowMetricRecords = null;
         final String inputDateFilePath = this.submitTerraMetricsCommand.getDataFilePath();
         try (BufferedReader metricsBufferedReader = new BufferedReader(new FileReader(inputDateFilePath))) {
-            //final Reader entriesCsv = new BufferedReader(new FileReader(inputDateFilePath));
             final CSVFormat csvFormat = CSVFormat.DEFAULT.builder()
                     .setHeader(TerraMetricsCsvHeaders.class)
                     .setSkipHeaderRecord(true)
@@ -92,11 +100,13 @@ public void submitMetrics() {
                         workflowMetricsToProcess.add(workflowMetricRecord);
                     } else {
                         workflowMetricsToProcess.add(workflowMetricRecord);
-                        LOG.info("Processing {} rows", workflowMetricsToProcess.size());
+                        LOG.info("Processing rows {} to {}", workflowMetricsToProcess.get(0).getRecordNumber(), workflowMetricsToProcess.get(workflowMetricsToProcess.size() - 1).getRecordNumber());
+
                         workflowMetricsToProcess.stream()
                                 .parallel()
                                 .forEach(record -> processWorkflowExecution(record, workflowsApi, extendedGa4GhApi, skippedExecutionsCsvPrinter));
                         workflowMetricsToProcess.clear();
+                        logStats();
                     }
                 }
             } catch (IOException e) {
@@ -104,7 +114,8 @@ public void submitMetrics() {
                 System.exit(FAILURE_EXIT_CODE);
             }
-            LOG.info("Done processing {} executions from Terra. Submitted {} executions. Skipped {} executions. Cache hits: {}. Cache misses: {}", numberOfExecutionsProcessed, numberOfExecutionsSubmitted, numberOfExecutionsSkipped, numberOfCacheHits, numberOfCacheMisses);
+            logStats();
+
             if (submitTerraMetricsCommand.isRecordSkippedExecutions()) {
                 LOG.info("View skipped executions in file {}", outputFileName);
             }
@@ -114,6 +125,10 @@ public void submitMetrics() {
         }
     }
+    private void logStats() {
+        LOG.info("Done processing {} executions from Terra. Submitted {} executions. Skipped {} executions.", numberOfExecutionsProcessed, numberOfExecutionsSubmitted, numberOfExecutionsSkipped);
+    }
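[Editor's note: not part of the original patch. The batch-then-parallelize loop above is why this commit switches the caches to ConcurrentHashMap and the counters to AtomicInteger. The containsKey-then-put sequence in processWorkflowExecution below is not atomic, so two parallel threads may both compute TRS info for the same source_url; that is benign here because the computed value is deterministic. An atomic alternative would be sourceUrlToSourceUrlTrsInfo.computeIfAbsent(sourceUrl, url -> ...), but it fits poorly when the mapping function needs to signal a skip through an empty Optional.]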
+
     private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsApi workflowsApi, ExtendedGa4GhApi extendedGa4GhApi, CSVPrinter skippedExecutionsCsvPrinter) {
         final String sourceUrl = workflowMetricRecord.get(TerraMetricsCsvHeaders.source_url);
         LOG.info("Processing execution on row {} with source_url {}", workflowMetricRecord.getRecordNumber(), sourceUrl);
         numberOfExecutionsProcessed.incrementAndGet();
         // Check to see if this source_url was skipped before
         if (sourceUrlsToSkipToReason.containsKey(sourceUrl)) {
-            skipExecution(sourceUrl, workflowMetricRecord, sourceUrlsToSkipToReason.get(sourceUrl), skippedExecutionsCsvPrinter);
+            skipExecution(sourceUrl, workflowMetricRecord, sourceUrlsToSkipToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true);
             return;
         }
         // Check to see if we need to figure out the TRS ID for the source URL
-        if (sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) {
-            numberOfCacheHits.incrementAndGet();
-        } else {
-            numberOfCacheMisses.incrementAndGet();
+        Optional<RunExecution> workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter);
+        if (workflowExecution.isEmpty()) {
+            return; // Row is invalid, don't figure out TRS ID
+        }
+
+        if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) {
             Optional<SourceUrlTrsInfo> sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, sourceUrl, workflowsApi, skippedExecutionsCsvPrinter);
             if (sourceUrlTrsInfo.isEmpty()) {
                 return;
             } else {
                 sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get());
             }
         }
-        final Optional<RunExecution> workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter);
-        if (workflowExecution.isEmpty()) {
-            return;
-        }
-
         final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl);
         final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get()));
         try {
             extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(),
                     "Terra metrics from Q4 2023");
             numberOfExecutionsSubmitted.incrementAndGet();
         } catch (ApiException e) {
-            skipExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter);
+            skipExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false);
         }
     }
     static Optional<ExecutionStatusEnum> getExecutionStatusEnumFromTerraStatus(String terraStatus) {
         ExecutionStatusEnum executionStatusEnum = switch (terraStatus) {
         case "Succeeded" -> ExecutionStatusEnum.SUCCESSFUL;
         case "Failed" -> ExecutionStatusEnum.FAILED;
         case "Aborted" -> ExecutionStatusEnum.ABORTED;
         default -> null;
         };
         return Optional.ofNullable(executionStatusEnum);
     }
-    static Optional<String> formatStringInIso8601Date(String workflowStart) {
-        // Example workflow_start: 2022-07-15 15:37:06.450000 UTC
-        final int maxNumberOfMicroSeconds = 9;
-        final DateTimeFormatter workflowStartFormat = new DateTimeFormatterBuilder()
-            .appendPattern("yyyy-MM-dd HH:mm:ss")
-            .appendFraction(ChronoField.MICRO_OF_SECOND, 0, maxNumberOfMicroSeconds, true) // There are varying widths of micro seconds
-            .optionalStart() // Start optional time zone pattern.
Time zone is not always included - .appendPattern(" ") - .appendZoneId() - .optionalEnd() // End optional time zone pattern - .toFormatter(); + private Optional getDateExecuted(CSVRecord csvRecord, String workflowStart, String sourceUrl, CSVPrinter skippedExecutionsCsvPrinter) { + try { + return Optional.of(getIso8601Date(workflowStart)); + } catch (DateTimeException e) { + skipExecution(sourceUrl, csvRecord, String.format("Could not get valid dateExecuted from workflow start '%s': %s", workflowStart, e), skippedExecutionsCsvPrinter, false); + return Optional.empty(); + } + } + + static String getIso8601Date(String workflowStart) throws DateTimeException { + final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, WORKFLOW_START_FORMAT); + return DateTimeFormatter.ISO_INSTANT.format(localDateTime.atOffset(ZoneOffset.UTC)); + } + + static Optional formatStringInIso8601Date(String workflowStart) { try { - final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, workflowStartFormat); + final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, WORKFLOW_START_FORMAT); return Optional.of(DateTimeFormatter.ISO_INSTANT.format(localDateTime.atOffset(ZoneOffset.UTC))); } catch (DateTimeParseException e) { + LOG.error("Could not format workflow_start '{}' in ISO 8601 date format", workflowStart, e); return Optional.empty(); } } private Optional getPrimaryDescriptorAbsolutePath(WorkflowsApi workflowsApi, MinimalWorkflowInfo workflow, String version) { + Optional primaryDescriptorPath = Optional.empty(); try { - return Optional.of(workflowsApi.primaryDescriptor1(workflow.id(), version, workflow.descriptorType().toString()).getAbsolutePath()); + primaryDescriptorPath = Optional.of(workflowsApi.primaryDescriptor1(workflow.id(), version, workflow.descriptorType().toString()).getAbsolutePath()); } catch (ApiException e) { - return Optional.empty(); + LOG.debug("Could not get primary descriptor path", e); } + return primaryDescriptorPath; } @@ -201,10 +220,14 @@ private List getSourceUrlComponents(String sourceUrl) { .toList(); } - private void skipExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter) { - LOG.info("Skipping execution on row {}: {}", csvRecordToSkip.getRecordNumber(), reason); - // Record to map for future reference - sourceUrlsToSkipToReason.put(sourceUrl, reason); + private void skipExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) { + LOG.info("Skipping execution on row {} with record {}: {}", csvRecordToSkip.getRecordNumber(), csvRecordToSkip.values(), reason); + + // Record to map for future reference. Only want to do this if the skip reason applies for ALL executions with the source_url. 
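+        // Examples: the source_url is ambiguous, or no published workflow matches its primary descriptor path.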
+ // Should not add to this map if the skip reason is specific to one execution + if (skipFutureExecutionsWithSourceUrl) { + sourceUrlsToSkipToReason.put(sourceUrl, reason); + } if (submitTerraMetricsCommand.isRecordSkippedExecutions()) { // Record to output CSV file for later examination // Headers: workflow_id, status, workflow_start, workflow_end, workflow_runtime_minutes, source_url @@ -228,21 +251,31 @@ public Optional getTerraWorkflowExecutionFromCsvRecord(CSVRecord c final String workflowRunTimeMinutes = csvRecord.get(TerraMetricsCsvHeaders.workflow_runtime_minutes); // Check that all required fields are non-null - if (executionId == null || workflowStart == null || status == null) { - skipExecution(sourceUrl, csvRecord, "One or more of the required fields (workflow_id, workflow_start, status) is missing", skippedExecutionsCsvPrinter); + if (StringUtils.isBlank(executionId)) { + skipExecution(sourceUrl, csvRecord, "The required field workflow_id is missing", skippedExecutionsCsvPrinter, false); + return Optional.empty(); + } + + if (StringUtils.isBlank(workflowStart)) { + skipExecution(sourceUrl, csvRecord, "The required field workflow_start is missing", skippedExecutionsCsvPrinter, false); + return Optional.empty(); + } + + if (StringUtils.isBlank(status)) { + skipExecution(sourceUrl, csvRecord, "The required field status is missing", skippedExecutionsCsvPrinter, false); return Optional.empty(); } // Format fields into Dockstore schema final Optional executionStatus = getExecutionStatusEnumFromTerraStatus(status); if (executionStatus.isEmpty()) { - skipExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter); + skipExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter, false); return Optional.empty(); } - final Optional dateExecuted = formatStringInIso8601Date(workflowStart); + final Optional dateExecuted = getDateExecuted(csvRecord, workflowStart, sourceUrl, skippedExecutionsCsvPrinter); if (dateExecuted.isEmpty()) { - skipExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter); + skipExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter, false); return Optional.empty(); } @@ -281,13 +314,13 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf if (!workflowPathPrefixToWorkflows.containsKey(workflowPathPrefix)) { try { List publishedWorkflowsWithSamePathPrefix = workflowsApi.getAllPublishedWorkflowByPath(workflowPathPrefix).stream() - .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new HashMap<>())) + .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new ConcurrentHashMap<>())) .toList(); workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix); } catch (ApiException e) { skipExecution(sourceUrl, workflowMetricRecord, "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID", - skippedExecutionsCsvPrinter); + skippedExecutionsCsvPrinter, true); return Optional.empty(); } } @@ -298,8 +331,8 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf // Loop through each workflow to find one that matches the primary 
descriptor workflowsFromSameRepo.forEach(workflow -> { if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { - // Intentionally putting null in map to indicate that the primary descriptor path for the workflow version doesn't exist - workflow.versionToPrimaryDescriptorPath().put(version, getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).orElse(null)); + // Intentionally putting empty string in map to indicate that the primary descriptor path for the workflow version doesn't exist + workflow.versionToPrimaryDescriptorPath().put(version, getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).map(this::makePathAbsolute).orElse("")); } final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version); @@ -311,12 +344,12 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf if (foundFullWorkflowPaths.isEmpty()) { - skipExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter); + skipExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter, true); return Optional.empty(); } else if (foundFullWorkflowPaths.size() > 1) { // There is already a workflow in the same repository with the same descriptor path that we're looking for. // Skip this source url because it is an ambiguous case and we can't identify which workflow the source url is referring to. - skipExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter); + skipExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter, true); return Optional.empty(); } else { final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); @@ -324,11 +357,15 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf } } else { - skipExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter); + skipExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); return Optional.empty(); } } + private String makePathAbsolute(String path) { + return path.startsWith("/") ? 
path : "/" + path; + } + /** * Record that stores the calculated TRS ID and version, derived from the source_url * @param sourceUrl @@ -338,6 +375,6 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf public record SourceUrlTrsInfo(String sourceUrl, String trsId, String version) { } - public record MinimalWorkflowInfo(long id, String fullWorkflowPath, DescriptorTypeEnum descriptorType, Map versionToPrimaryDescriptorPath) { + public record MinimalWorkflowInfo(long id, String fullWorkflowPath, DescriptorTypeEnum descriptorType, ConcurrentHashMap versionToPrimaryDescriptorPath) { } } From fdb4c7559ce4c0d60b38a3fe1242f28ce9d3e72a Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 14 Dec 2023 13:12:19 -0500 Subject: [PATCH 05/19] Aggregate metrics in parallel --- .../MetricsAggregatorS3Client.java | 103 ++++++++++-------- 1 file changed, 59 insertions(+), 44 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java index 92c5b479..f660de51 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java @@ -48,6 +48,9 @@ public class MetricsAggregatorS3Client { private static final Logger LOG = LoggerFactory.getLogger(MetricsAggregatorS3Client.class); private static final Gson GSON = new Gson(); + private int numberOfDirectoriesProcessed = 0; + private int numberOfMetricsSubmitted = 0; + private int numberOfMetricsSkipped = 0; private final String bucketName; @@ -67,60 +70,72 @@ public MetricsAggregatorS3Client(String bucketName, String s3EndpointOverride) t } public void aggregateMetrics(ExtendedGa4GhApi extendedGa4GhApi) { + LOG.info("Getting directories to process..."); List metricsDirectories = getDirectories(); if (metricsDirectories.isEmpty()) { - System.out.println("No directories found to aggregate metrics"); + LOG.info("No directories found to aggregate metrics"); return; } - System.out.println("Aggregating metrics..."); - for (S3DirectoryInfo directoryInfo : metricsDirectories) { - String toolId = directoryInfo.toolId(); - String versionName = directoryInfo.versionId(); - List platforms = directoryInfo.platforms(); - String platformsString = String.join(", ", platforms); - String versionS3KeyPrefix = directoryInfo.versionS3KeyPrefix(); - - // Collect metrics for each platform, so we can calculate metrics across all platforms - List allMetrics = new ArrayList<>(); - for (String platform : platforms) { - ExecutionsRequestBody allSubmissions; - try { - allSubmissions = getExecutions(toolId, versionName, platform); - } catch (Exception e) { - LOG.error("Error aggregating metrics: Could not get all executions from directory {}", versionS3KeyPrefix, e); - continue; // Continue aggregating metrics for other directories - } - - try { - getAggregatedMetrics(allSubmissions).ifPresent(metrics -> { - extendedGa4GhApi.aggregatedMetricsPut(metrics, platform, toolId, versionName); - System.out.printf("Aggregated metrics for tool ID %s, version %s, platform %s from directory %s%n", toolId, versionName, platform, versionS3KeyPrefix); - allMetrics.add(metrics); - }); - } catch (Exception e) { - LOG.error("Error aggregating metrics: Could not put all executions from directory {}", versionS3KeyPrefix, e); - // Continue aggregating metrics for other platforms - } + LOG.info("Aggregating metrics 
for {} directories", metricsDirectories.size()); + metricsDirectories.stream() + .parallel() + .forEach(directoryInfo -> aggregateMetricsForDirectory(directoryInfo, extendedGa4GhApi)); + LOG.info("Completed aggregating metrics. Processed {} directories, submitted {} metrics, and skipped {} metrics", numberOfDirectoriesProcessed, numberOfMetricsSubmitted, numberOfMetricsSkipped); + } + + private void aggregateMetricsForDirectory(S3DirectoryInfo directoryInfo, ExtendedGa4GhApi extendedGa4GhApi) { + LOG.info("Processing directory {}", directoryInfo); + String toolId = directoryInfo.toolId(); + String versionName = directoryInfo.versionId(); + List platforms = directoryInfo.platforms(); + String platformsString = String.join(", ", platforms); + String versionS3KeyPrefix = directoryInfo.versionS3KeyPrefix(); + + // Collect metrics for each platform, so we can calculate metrics across all platforms + List allMetrics = new ArrayList<>(); + for (String platform : platforms) { + ExecutionsRequestBody allSubmissions; + try { + allSubmissions = getExecutions(toolId, versionName, platform); + } catch (Exception e) { + LOG.error("Error aggregating metrics: Could not get all executions from directory {}", versionS3KeyPrefix, e); + ++numberOfMetricsSkipped; + continue; // Continue aggregating metrics for other directories } - if (!allMetrics.isEmpty()) { - // Calculate metrics across all platforms by aggregating the aggregated metrics from each platform - try { - getAggregatedMetrics(new ExecutionsRequestBody().aggregatedExecutions(allMetrics)).ifPresent(metrics -> { - extendedGa4GhApi.aggregatedMetricsPut(metrics, Partner.ALL.name(), toolId, versionName); - System.out.printf("Aggregated metrics across all platforms (%s) for tool ID %s, version %s from directory %s%n", - platformsString, toolId, versionName, versionS3KeyPrefix); - allMetrics.add(metrics); - }); - } catch (Exception e) { - LOG.error("Error aggregating metrics across all platforms ({}) for tool ID {}, version {} from directory {}", platformsString, toolId, versionName, versionS3KeyPrefix, e); - // Continue aggregating metrics for other directories - } + try { + getAggregatedMetrics(allSubmissions).ifPresent(metrics -> { + extendedGa4GhApi.aggregatedMetricsPut(metrics, platform, toolId, versionName); + System.out.printf("Aggregated metrics for tool ID %s, version %s, platform %s from directory %s%n", toolId, versionName, platform, versionS3KeyPrefix); + allMetrics.add(metrics); + }); + } catch (Exception e) { + LOG.error("Error aggregating metrics: Could not put all executions from directory {}", versionS3KeyPrefix, e); + // Continue aggregating metrics for other platforms + } + } + + if (!allMetrics.isEmpty()) { + // Calculate metrics across all platforms by aggregating the aggregated metrics from each platform + try { + getAggregatedMetrics(new ExecutionsRequestBody().aggregatedExecutions(allMetrics)).ifPresent(metrics -> { + extendedGa4GhApi.aggregatedMetricsPut(metrics, Partner.ALL.name(), toolId, versionName); + System.out.printf("Aggregated metrics across all platforms (%s) for tool ID %s, version %s from directory %s%n", + platformsString, toolId, versionName, versionS3KeyPrefix); + allMetrics.add(metrics); + }); + } catch (Exception e) { + LOG.error("Error aggregating metrics across all platforms ({}) for tool ID {}, version {} from directory {}", platformsString, toolId, versionName, versionS3KeyPrefix, e); + // Continue aggregating metrics for other directories } + ++numberOfMetricsSubmitted; + } else { + ++numberOfMetricsSkipped; 
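+            // Skipped: no platform in this directory produced aggregated metrics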
} - System.out.println("Completed aggregating metrics"); + ++numberOfDirectoriesProcessed; + LOG.info("Processed {} directories", numberOfDirectoriesProcessed); } /** From 41fdc27830417fe7169a83df6c46495a45a6bedc Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 14 Dec 2023 17:41:07 -0500 Subject: [PATCH 06/19] Clean up: add more documentation --- .../client/cli/MetricsAggregatorClient.java | 2 +- .../client/cli/TerraMetricsSubmitter.java | 289 ++++++++++-------- .../client/cli/TerraMetricsSubmitterTest.java | 17 ++ 3 files changed, 183 insertions(+), 125 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java index 6c7e4c06..7aeaafe5 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java @@ -143,7 +143,7 @@ public static void main(String[] args) { final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config.get()); final TerraMetricsSubmitter submitTerraMetricsCommand = new TerraMetricsSubmitter(metricsAggregatorConfig, submitTerraMetrics); - submitTerraMetricsCommand.submitMetrics(); + submitTerraMetricsCommand.submitTerraMetrics(); } catch (Exception e) { LOG.error("Could not submit terra metrics to Dockstore", e); System.exit(FAILURE_EXIT_CODE); diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index 7490ba4b..12b73eb2 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -21,7 +21,6 @@ import java.io.FileWriter; import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.time.DateTimeException; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneOffset; @@ -64,9 +63,10 @@ public class TerraMetricsSubmitter { private final AtomicInteger numberOfExecutionsSkipped = new AtomicInteger(0); // Keep track of sourceUrls that are found to be ambiguous - private final Map sourceUrlsToSkipToReason = new ConcurrentHashMap<>(); + private final Map skippedSourceUrlsToReason = new ConcurrentHashMap<>(); // Map of source url to TRS info that was previously calculated private final Map sourceUrlToSourceUrlTrsInfo = new ConcurrentHashMap<>(); + // Map of workflow path prefixes, like github.com/organization/repo, to published workflows with the same workflow path prefix private final Map> workflowPathPrefixToWorkflows = new ConcurrentHashMap<>(); public TerraMetricsSubmitter(MetricsAggregatorConfig config, SubmitTerraMetrics submitTerraMetricsCommand) { @@ -74,13 +74,13 @@ public TerraMetricsSubmitter(MetricsAggregatorConfig config, SubmitTerraMetrics this.submitTerraMetricsCommand = submitTerraMetricsCommand; } - public void submitMetrics() { + public void submitTerraMetrics() { ApiClient apiClient = setupApiClient(config.getDockstoreServerUrl(), config.getDockstoreToken()); ExtendedGa4GhApi extendedGa4GhApi = new ExtendedGa4GhApi(apiClient); WorkflowsApi workflowsApi = new WorkflowsApi(apiClient); // Read CSV file - Iterable workflowMetricRecords = 
null; + Iterable workflowMetricRecords; final String inputDateFilePath = this.submitTerraMetricsCommand.getDataFilePath(); try (BufferedReader metricsBufferedReader = new BufferedReader(new FileReader(inputDateFilePath))) { final CSVFormat csvFormat = CSVFormat.DEFAULT.builder() @@ -90,10 +90,15 @@ public void submitMetrics() { .build(); workflowMetricRecords = csvFormat.parse(metricsBufferedReader); + // This output file is used to record skipped executions final String outputFileName = inputDateFilePath + "_skipped_executions_" + Instant.now().truncatedTo(ChronoUnit.SECONDS).toString().replace("-", "").replace(":", "") + ".csv"; List workflowMetricsToProcess = new ArrayList<>(); - try (CSVPrinter skippedExecutionsCsvPrinter = submitTerraMetricsCommand.isRecordSkippedExecutions() ? new CSVPrinter(new FileWriter(outputFileName, StandardCharsets.UTF_8), CSVFormat.DEFAULT.builder().setHeader(SkippedTerraMetricsCsvHeaders.class).build()) : null) { + try (CSVPrinter skippedExecutionsCsvPrinter = submitTerraMetricsCommand.isRecordSkippedExecutions() ? new CSVPrinter( + new FileWriter(outputFileName, StandardCharsets.UTF_8), + CSVFormat.DEFAULT.builder().setHeader(SkippedTerraMetricsCsvHeaders.class).build()) : null) { + + // Process the executions by reading 100,000 rows, then processing the rows in parallel for (CSVRecord workflowMetricRecord: workflowMetricRecords) { final int batchSize = 100000; if (workflowMetricsToProcess.size() < batchSize && workflowMetricRecords.iterator().hasNext()) { @@ -126,24 +131,35 @@ public void submitMetrics() { } private void logStats() { - LOG.info("New way: Done processing {} executions from Terra. Submitted {} executions. Skipped {} executions.", numberOfExecutionsProcessed, numberOfExecutionsSubmitted, numberOfExecutionsSkipped); + LOG.info("Done processing {} executions from Terra. Submitted {} executions. Skipped {} executions.", numberOfExecutionsProcessed, numberOfExecutionsSubmitted, numberOfExecutionsSkipped); } + /** + * Processes the workflow execution from the CSV record and submits the execution to Dockstore if it is valid. 
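+     * A record is considered valid if the required fields (workflow_id, workflow_start, status) are present, the Terra status maps
+     * to a Dockstore execution status, and workflow_start can be parsed into an ISO 8601 date.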
+ * @param workflowMetricRecord + * @param workflowsApi + * @param extendedGa4GhApi + * @param skippedExecutionsCsvPrinter + */ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsApi workflowsApi, ExtendedGa4GhApi extendedGa4GhApi, CSVPrinter skippedExecutionsCsvPrinter) { final String sourceUrl = workflowMetricRecord.get(TerraMetricsCsvHeaders.source_url); LOG.info("Processing execution on row {} with source_url {}", workflowMetricRecord.getRecordNumber(), sourceUrl); numberOfExecutionsProcessed.incrementAndGet(); + if (StringUtils.isBlank(sourceUrl)) { + logSkippedExecution("", workflowMetricRecord, "Can't determine TRS ID because source_url is missing", skippedExecutionsCsvPrinter, false); + } + // Check to see if this source_url was skipped before - if (sourceUrlsToSkipToReason.containsKey(sourceUrl)) { - skipExecution(sourceUrl, workflowMetricRecord, sourceUrlsToSkipToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true); + if (skippedSourceUrlsToReason.containsKey(sourceUrl)) { + logSkippedExecution(sourceUrl, workflowMetricRecord, skippedSourceUrlsToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true); return; } - // Check to see if we need to figure out the TRS ID for the source URL + // Check if the information from the workflow execution is valid Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); if (workflowExecution.isEmpty()) { - return; // Row is invalid, don't figure out TRS ID + return; } if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { @@ -162,71 +178,27 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA "Terra metrics from Q4 2023"); numberOfExecutionsSubmitted.incrementAndGet(); } catch (ApiException e) { - skipExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); - } - } - - static Optional getExecutionStatusEnumFromTerraStatus(String terraStatus) { - ExecutionStatusEnum executionStatusEnum = switch (terraStatus) { - case "Succeeded" -> ExecutionStatusEnum.SUCCESSFUL; - case "Failed" -> ExecutionStatusEnum.FAILED; - case "Aborted" -> ExecutionStatusEnum.ABORTED; - default -> null; - }; - return Optional.ofNullable(executionStatusEnum); - } - - private Optional getDateExecuted(CSVRecord csvRecord, String workflowStart, String sourceUrl, CSVPrinter skippedExecutionsCsvPrinter) { - try { - return Optional.of(getIso8601Date(workflowStart)); - } catch (DateTimeException e) { - skipExecution(sourceUrl, csvRecord, String.format("Could not get valid dateExecuted from workflow start '%s': %s", workflowStart, e), skippedExecutionsCsvPrinter, false); - return Optional.empty(); + logSkippedExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); } } - static String getIso8601Date(String workflowStart) throws DateTimeException { - final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, WORKFLOW_START_FORMAT); - return DateTimeFormatter.ISO_INSTANT.format(localDateTime.atOffset(ZoneOffset.UTC)); - } - - - static Optional formatStringInIso8601Date(String workflowStart) { - try { - final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, WORKFLOW_START_FORMAT); - return 
Optional.of(DateTimeFormatter.ISO_INSTANT.format(localDateTime.atOffset(ZoneOffset.UTC))); - } catch (DateTimeParseException e) { - LOG.error("Could not format workflow_start '{}' in ISO 8601 date format", workflowStart, e); - return Optional.empty(); - } - } - - private Optional getPrimaryDescriptorAbsolutePath(WorkflowsApi workflowsApi, MinimalWorkflowInfo workflow, String version) { - Optional primaryDescriptorPath = Optional.empty(); - try { - primaryDescriptorPath = Optional.of(workflowsApi.primaryDescriptor1(workflow.id(), version, workflow.descriptorType().toString()).getAbsolutePath()); - } catch (ApiException e) { - LOG.debug("Could not get primary descriptor path", e); - } - return primaryDescriptorPath; - } - - - private List getSourceUrlComponents(String sourceUrl) { - final String rawGitHubUserContentUrlPrefix = "https://raw.githubusercontent.com/"; // The source_url starts with this - final String sourceUrlWithoutGitHubPrefix = sourceUrl.replace(rawGitHubUserContentUrlPrefix, ""); - return Arrays.stream(sourceUrlWithoutGitHubPrefix.split("/")) - .filter(urlComponent -> !urlComponent.isEmpty()) // Filter out empty strings that are a result of consecutive slashes '//' - .toList(); - } - - private void skipExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) { - LOG.info("Skipping execution on row {} with record {}: {}", csvRecordToSkip.getRecordNumber(), csvRecordToSkip.values(), reason); + /** + * Performs logging and writing of the skipped execution to an output file. + * If skipFutureExecutionsWithSourceUrl is true, also adds the source_url of the skipped execution to the sourceUrlsToSkipToReason map + * so that future executions with the same source_url are skipped. + * @param sourceUrl source_url of the csvRecordToSkip + * @param csvRecordToSkip CSVRecord to skip + * @param reason Reason that this execution is being skipped + * @param skippedExecutionsCsvPrinter CSVPrinter that writes to the output file + * @param skipFutureExecutionsWithSourceUrl boolean indicating if all executions with the same source_url should be skipped + */ + private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) { + LOG.info("Skipping execution on row {}: {}", csvRecordToSkip.getRecordNumber(), reason); // Record to map for future reference. Only want to do this if the skip reason applies for ALL executions with the source_url. // Should not add to this map if the skip reason is specific to one execution if (skipFutureExecutionsWithSourceUrl) { - sourceUrlsToSkipToReason.put(sourceUrl, reason); + skippedSourceUrlsToReason.put(sourceUrl, reason); } if (submitTerraMetricsCommand.isRecordSkippedExecutions()) { // Record to output CSV file for later examination @@ -244,38 +216,47 @@ private void skipExecution(String sourceUrl, CSVRecord csvRecordToSkip, String r numberOfExecutionsSkipped.incrementAndGet(); } + /** + * Gets a RunExecution representing a single Terra workflow execution from one row of the CSV file. + * If the CSV record is invalid, the function will record the reason why the execution was skipped using skippedExecutionsCsvPrinter. + * Note: If an execution is skipped in this function, it means that the reason is specific to the execution, not the source_url! 
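+     * For example, a row with an invalid workflow_start is skipped, but other rows with the same source_url can still be submitted.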
+ * @param csvRecord + * @param sourceUrl + * @param skippedExecutionsCsvPrinter + * @return + */ public Optional getTerraWorkflowExecutionFromCsvRecord(CSVRecord csvRecord, String sourceUrl, CSVPrinter skippedExecutionsCsvPrinter) { final String executionId = csvRecord.get(TerraMetricsCsvHeaders.workflow_id); final String workflowStart = csvRecord.get(TerraMetricsCsvHeaders.workflow_start); final String status = csvRecord.get(TerraMetricsCsvHeaders.status); final String workflowRunTimeMinutes = csvRecord.get(TerraMetricsCsvHeaders.workflow_runtime_minutes); - // Check that all required fields are non-null + // Check that all required fields are present if (StringUtils.isBlank(executionId)) { - skipExecution(sourceUrl, csvRecord, "The required field workflow_id is missing", skippedExecutionsCsvPrinter, false); + logSkippedExecution(sourceUrl, csvRecord, "The required field workflow_id is missing", skippedExecutionsCsvPrinter, false); return Optional.empty(); } if (StringUtils.isBlank(workflowStart)) { - skipExecution(sourceUrl, csvRecord, "The required field workflow_start is missing", skippedExecutionsCsvPrinter, false); + logSkippedExecution(sourceUrl, csvRecord, "The required field workflow_start is missing", skippedExecutionsCsvPrinter, false); return Optional.empty(); } if (StringUtils.isBlank(status)) { - skipExecution(sourceUrl, csvRecord, "The required field status is missing", skippedExecutionsCsvPrinter, false); + logSkippedExecution(sourceUrl, csvRecord, "The required field status is missing", skippedExecutionsCsvPrinter, false); return Optional.empty(); } // Format fields into Dockstore schema final Optional executionStatus = getExecutionStatusEnumFromTerraStatus(status); if (executionStatus.isEmpty()) { - skipExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter, false); + logSkippedExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter, false); return Optional.empty(); } - final Optional dateExecuted = getDateExecuted(csvRecord, workflowStart, sourceUrl, skippedExecutionsCsvPrinter); + final Optional dateExecuted = formatStringInIso8601Date(workflowStart); if (dateExecuted.isEmpty()) { - skipExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter, false); + logSkippedExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter, false); return Optional.empty(); } @@ -288,6 +269,26 @@ public Optional getTerraWorkflowExecutionFromCsvRecord(CSVRecord c return Optional.of(workflowExecution); } + static Optional formatStringInIso8601Date(String workflowStart) { + try { + final LocalDateTime localDateTime = LocalDateTime.parse(workflowStart, WORKFLOW_START_FORMAT); + return Optional.of(DateTimeFormatter.ISO_INSTANT.format(localDateTime.atOffset(ZoneOffset.UTC))); + } catch (DateTimeParseException e) { + LOG.error("Could not format workflow_start '{}' in ISO 8601 date format", workflowStart, e); + return Optional.empty(); + } + } + + static Optional getExecutionStatusEnumFromTerraStatus(String terraStatus) { + ExecutionStatusEnum executionStatusEnum = switch (terraStatus) { + case "Succeeded" -> ExecutionStatusEnum.SUCCESSFUL; + case "Failed" -> ExecutionStatusEnum.FAILED; + case "Aborted" -> ExecutionStatusEnum.ABORTED; + default -> null; + }; + return 
Optional.ofNullable(executionStatusEnum); + } + static Optional getExecutionTime(String workflowRunTimeMinutes) { if (StringUtils.isBlank(workflowRunTimeMinutes)) { return Optional.empty(); @@ -295,6 +296,20 @@ static Optional getExecutionTime(String workflowRunTimeMinutes) { return Optional.of(String.format("PT%sM", workflowRunTimeMinutes)); } + /** + * Calculates the TRS ID from the source_url by: + *
+     * 1. Looking at all published workflows that have the same workflow path prefix, i.e. they belong to the same GitHub repository.
+     * 2. For each published workflow, getting the primary descriptor path for the version specified in source_url and checking if it matches the primary descriptor path in the source_url.
+     * 3. Ensuring that there is only one workflow in the repository with the same descriptor path. If there are multiple, it is an ambiguous case and we skip the execution.
+ * + * @param workflowMetricRecord + * @param sourceUrl + * @param workflowsApi + * @param skippedExecutionsCsvPrinter + * @return + */ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workflowMetricRecord, String sourceUrl, WorkflowsApi workflowsApi, CSVPrinter skippedExecutionsCsvPrinter) { // Need to figure out the TRS ID and version name using the source_url. // Example source_url: https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl @@ -302,67 +317,93 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf // Note that the TRS ID may also have a workflow name, which we need to figure out final List sourceUrlComponents = getSourceUrlComponents(sourceUrl); - // There should be at least three elements in order for there to be an organization name, foo>/, and version - // in /// + final int minNumberOfComponents = 3; - if (sourceUrlComponents.size() >= minNumberOfComponents) { - final String organization = sourceUrlComponents.get(0) + "/" + sourceUrlComponents.get(1); - final String version = sourceUrlComponents.get(2); - final String primaryDescriptorPathFromUrl = "/" + String.join("/", sourceUrlComponents.subList(3, sourceUrlComponents.size())); - - final String workflowPathPrefix = "github.com/" + organization; - if (!workflowPathPrefixToWorkflows.containsKey(workflowPathPrefix)) { - try { - List publishedWorkflowsWithSamePathPrefix = workflowsApi.getAllPublishedWorkflowByPath(workflowPathPrefix).stream() - .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new ConcurrentHashMap<>())) - .toList(); - workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix); - } catch (ApiException e) { - skipExecution(sourceUrl, workflowMetricRecord, - "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID", - skippedExecutionsCsvPrinter, true); - return Optional.empty(); - } - } + if (sourceUrlComponents.size() < minNumberOfComponents) { + logSkippedExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); + return Optional.empty(); + } - List workflowsFromSameRepo = workflowPathPrefixToWorkflows.get(workflowPathPrefix); + // There should be at least three elements in order for there to be an organization name, foo>/, and version + // in /// + final String organization = sourceUrlComponents.get(0) + "/" + sourceUrlComponents.get(1); + final String version = sourceUrlComponents.get(2); + final String primaryDescriptorPathFromUrl = "/" + String.join("/", sourceUrlComponents.subList(3, sourceUrlComponents.size())); - List foundFullWorkflowPaths = new ArrayList<>(); - // Loop through each workflow to find one that matches the primary descriptor - workflowsFromSameRepo.forEach(workflow -> { - if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { - // Intentionally putting empty string in map to indicate that the primary descriptor path for the workflow version doesn't exist - workflow.versionToPrimaryDescriptorPath().put(version, getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).map(this::makePathAbsolute).orElse("")); - } + final String workflowPathPrefix = "github.com/" + organization; + if (!workflowPathPrefixToWorkflows.containsKey(workflowPathPrefix)) { + try { + List publishedWorkflowsWithSamePathPrefix = 
workflowsApi.getAllPublishedWorkflowByPath( + workflowPathPrefix).stream() + .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new ConcurrentHashMap<>())).toList(); + workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix); + } catch (ApiException e) { + logSkippedExecution(sourceUrl, workflowMetricRecord, + "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID", + skippedExecutionsCsvPrinter, true); + return Optional.empty(); + } + } - final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version); - if (primaryDescriptorPathFromUrl.equals(primaryDescriptorPathForVersion)) { - // Collect a list of workflow paths that have the primary descriptor path we're looking for - foundFullWorkflowPaths.add(workflow.fullWorkflowPath()); - } - }); + List workflowsFromSameRepo = workflowPathPrefixToWorkflows.get(workflowPathPrefix); + List foundFullWorkflowPaths = new ArrayList<>(); + // Loop through each workflow to find one that matches the primary descriptor + workflowsFromSameRepo.forEach(workflow -> { + // Get the primary descriptor path for the version and update the map, either with the primary descriptor path or an empty string to indicate that it was not found + if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { + final String primaryDescriptorAbsolutePath = makePathAbsolute(getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).orElse("")); + workflow.versionToPrimaryDescriptorPath().put(version, primaryDescriptorAbsolutePath); + } - if (foundFullWorkflowPaths.isEmpty()) { - skipExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter, true); - return Optional.empty(); - } else if (foundFullWorkflowPaths.size() > 1) { - // There is already a workflow in the same repository with the same descriptor path that we're looking for. - // Skip this source url because it is an ambiguous case and we can't identify which workflow the source url is referring to. 
- skipExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter, true); - return Optional.empty(); - } else { - final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); - return Optional.of(sourceUrlTrsInfo); + // Check to see if there's a version that has the same primary descriptor path + final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version); + if (primaryDescriptorPathFromUrl.equals(primaryDescriptorPathForVersion)) { + foundFullWorkflowPaths.add(workflow.fullWorkflowPath()); } + }); - } else { - skipExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); + if (foundFullWorkflowPaths.isEmpty()) { + logSkippedExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter, + true); return Optional.empty(); + } else if (foundFullWorkflowPaths.size() > 1) { + // There is already a workflow in the same repository with the same descriptor path that we're looking for. + // Skip this source_url because it is an ambiguous case and we can't identify which workflow the source url is referring to. + logSkippedExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", + foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter, true); + return Optional.empty(); + } else { + final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); + return Optional.of(sourceUrlTrsInfo); } } - private String makePathAbsolute(String path) { + /** + * Returns a list of slash-delimited components from the source_url. + * Example: given source_url https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl, + * returns a list of "theiagen", "public_health_viral_genomics", "v2.0.0", "wf_theiacov_fasta.wdl" + * @param sourceUrl The source_url that starts with https://raw.githubusercontent.com/. + * @return + */ + static List getSourceUrlComponents(String sourceUrl) { + final String sourceUrlWithoutGitHubPrefix = sourceUrl.replace("https://raw.githubusercontent.com/", ""); + return Arrays.stream(sourceUrlWithoutGitHubPrefix.split("/")) + .filter(urlComponent -> !urlComponent.isEmpty()) // Filter out empty strings that are a result of consecutive slashes '//' + .toList(); + } + + private Optional getPrimaryDescriptorAbsolutePath(WorkflowsApi workflowsApi, MinimalWorkflowInfo workflow, String version) { + Optional primaryDescriptorPath = Optional.empty(); + try { + primaryDescriptorPath = Optional.of(workflowsApi.primaryDescriptor1(workflow.id(), version, workflow.descriptorType().toString()).getAbsolutePath()); + } catch (ApiException e) { + LOG.debug("Could not get primary descriptor path", e); + } + return primaryDescriptorPath; + } + + static String makePathAbsolute(String path) { return path.startsWith("/") ? 
path : "/" + path; } diff --git a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java index 8b038362..50b431c4 100644 --- a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java +++ b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java @@ -2,10 +2,13 @@ import static io.dockstore.metricsaggregator.client.cli.TerraMetricsSubmitter.formatStringInIso8601Date; import static io.dockstore.metricsaggregator.client.cli.TerraMetricsSubmitter.getExecutionTime; +import static io.dockstore.metricsaggregator.client.cli.TerraMetricsSubmitter.getSourceUrlComponents; +import static io.dockstore.metricsaggregator.client.cli.TerraMetricsSubmitter.makePathAbsolute; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum; +import java.util.List; import org.junit.jupiter.api.Test; class TerraMetricsSubmitterTest { @@ -37,4 +40,18 @@ void testGetExecutionTime() { assertTrue(getExecutionTime(" ").isEmpty()); assertTrue(getExecutionTime(null).isEmpty()); } + + @Test + void testGetSourceUrlComponents() { + assertEquals(List.of("theiagen", "public_health_viral_genomics", "v2.0.0", "workflows", "wf_theiacov_fasta.wdl"), getSourceUrlComponents("https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl")); + // This source_url has consecutive slashes + assertEquals(List.of("theiagen", "public_health_viral_genomics", "v2.0.0", "workflows", "wf_theiacov_fasta.wdl"), getSourceUrlComponents("https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl")); + assertEquals(List.of(), getSourceUrlComponents("https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl")); + } + + @Test + void testMakePathAbsolute() { + assertEquals("/foo.wdl", makePathAbsolute("foo.wdl")); + assertEquals("/foo.wdl", makePathAbsolute("/foo.wdl")); + } } From 3d8f7855d599412e3b09287c809652dc815184bb Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Wed, 3 Jan 2024 10:00:06 -0500 Subject: [PATCH 07/19] Don't check execution first --- .../client/cli/TerraMetricsSubmitter.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index 12b73eb2..dbc43bb4 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -156,12 +156,6 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA return; } - // Check if the information from the workflow execution is valid - Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); - if (workflowExecution.isEmpty()) { - return; - } - if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, sourceUrl, 
workflowsApi, skippedExecutionsCsvPrinter); if (sourceUrlTrsInfo.isEmpty()) { @@ -171,6 +165,12 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA } } + // Check if the information from the workflow execution is valid + Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); + if (workflowExecution.isEmpty()) { + return; + } + final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl); final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get())); try { From 6b84efeb1b512baf79d0b8880d07696118c679c0 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Sun, 7 Jan 2024 20:19:02 -0500 Subject: [PATCH 08/19] Group by source url - much quicker! --- .../client/cli/TerraMetricsSubmitter.java | 161 +++++++++++++++++- 1 file changed, 156 insertions(+), 5 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index dbc43bb4..d2062371 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -2,6 +2,7 @@ import static io.dockstore.utils.CLIConstants.FAILURE_EXIT_CODE; import static io.dockstore.utils.DockstoreApiClientUtils.setupApiClient; +import static java.util.stream.Collectors.groupingBy; import io.dockstore.common.Partner; import io.dockstore.metricsaggregator.MetricsAggregatorConfig; @@ -105,11 +106,28 @@ public void submitTerraMetrics() { workflowMetricsToProcess.add(workflowMetricRecord); } else { workflowMetricsToProcess.add(workflowMetricRecord); + // Collect a map of CSV records with the same source URL + Map> sourceUrlToCsvRecords = workflowMetricsToProcess.stream() + .collect(groupingBy(csvRecord -> csvRecord.get(TerraMetricsCsvHeaders.source_url))); + /* + for (CSVRecord workflowMetricToProcess: workflowMetricsToProcess) { + final String csvRecordSourceUrl = workflowMetricToProcess.get(TerraMetricsCsvHeaders.source_url); + List csvRecordsWithSameSourceUrl = sourceUrlToCsvRecords.getOrDefault(csvRecordSourceUrl, new ArrayList<>()); + csvRecordsWithSameSourceUrl.add(workflowMetricToProcess); + sourceUrlToCsvRecords.put(csvRecordSourceUrl, csvRecordsWithSameSourceUrl); + } + */ + LOG.info("Processing rows {} to {}", workflowMetricsToProcess.get(0).getRecordNumber(), workflowMetricsToProcess.get(workflowMetricsToProcess.size() - 1).getRecordNumber()); - workflowMetricsToProcess.stream() + sourceUrlToCsvRecords.entrySet().stream() .parallel() - .forEach(record -> processWorkflowExecution(record, workflowsApi, extendedGa4GhApi, skippedExecutionsCsvPrinter)); + .forEach(entry -> { + final String sourceUrl = entry.getKey(); + final List csvRecordsWithSameSourceUrl = entry.getValue(); + submitWorkflowExecutions(sourceUrl, csvRecordsWithSameSourceUrl, workflowsApi, extendedGa4GhApi, skippedExecutionsCsvPrinter); + }); + workflowMetricsToProcess.clear(); logStats(); } @@ -157,12 +175,15 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA } if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { + LOG.info("Calculating TRS info for source_url {}", sourceUrl); Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, 
sourceUrl, workflowsApi, skippedExecutionsCsvPrinter); if (sourceUrlTrsInfo.isEmpty()) { return; } else { sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); } + } else { + LOG.info("TRS info for source_url {} was previously calculated", sourceUrl); } // Check if the information from the workflow execution is valid @@ -182,6 +203,54 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA } } + private void submitWorkflowExecutions(String sourceUrl, List workflowMetricRecords, WorkflowsApi workflowsApi, ExtendedGa4GhApi extendedGa4GhApi, CSVPrinter skippedExecutionsCsvPrinter) { + LOG.info("Processing source_url {} for {} executions", sourceUrl, workflowMetricRecords.size()); + numberOfExecutionsProcessed.addAndGet(workflowMetricRecords.size()); + + if (StringUtils.isBlank(sourceUrl)) { + workflowMetricRecords.forEach(workflowMetricRecord -> { + logSkippedExecution("", workflowMetricRecord, "Can't determine TRS ID because source_url is missing", skippedExecutionsCsvPrinter, false); + }); + } + + // Check to see if this source_url was skipped before + if (skippedSourceUrlsToReason.containsKey(sourceUrl)) { + workflowMetricRecords.forEach(workflowMetricRecord -> { + logSkippedExecution(sourceUrl, workflowMetricRecord, skippedSourceUrlsToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true); + }); + return; + } + + if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { + Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(sourceUrl, workflowsApi); + if (sourceUrlTrsInfo.isEmpty()) { + workflowMetricRecords.forEach(workflowMetricRecord -> { + logSkippedExecution(sourceUrl, workflowMetricRecord, "Could not calculate TRS info", skippedExecutionsCsvPrinter, true); + }); + return; + } else { + sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); + } + } + + final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl); + final List workflowExecutionsToSubmit = workflowMetricRecords.stream() + .map(workflowExecution -> getTerraWorkflowExecutionFromCsvRecord(workflowExecution, sourceUrlTrsInfo.sourceUrl(), skippedExecutionsCsvPrinter)) + .filter(Optional::isPresent) + .map(Optional::get) + .toList(); + final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(workflowExecutionsToSubmit); + try { + extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), + "Terra metrics from Q4 2023"); + numberOfExecutionsSubmitted.addAndGet(workflowMetricRecords.size()); + } catch (ApiException e) { + workflowMetricRecords.forEach(workflowMetricRecord -> { + logSkippedExecution(sourceUrlTrsInfo.sourceUrl(), workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); + }); + } + } + /** * Performs logging and writing of the skipped execution to an output file. 
* If skipFutureExecutionsWithSourceUrl is true, also adds the source_url of the skipped execution to the sourceUrlsToSkipToReason map @@ -193,8 +262,6 @@ private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsA * @param skipFutureExecutionsWithSourceUrl boolean indicating if all executions with the same source_url should be skipped */ private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) { - LOG.info("Skipping execution on row {}: {}", csvRecordToSkip.getRecordNumber(), reason); - // Record to map for future reference. Only want to do this if the skip reason applies for ALL executions with the source_url. // Should not add to this map if the skip reason is specific to one execution if (skipFutureExecutionsWithSourceUrl) { @@ -262,7 +329,7 @@ public Optional getTerraWorkflowExecutionFromCsvRecord(CSVRecord c RunExecution workflowExecution = new RunExecution(); // TODO: uncomment below when the update executions endpoint PR is merged - // workflowExecution.setExecutionId(executionId); + workflowExecution.setExecutionId(executionId); workflowExecution.setExecutionStatus(executionStatus.get()); workflowExecution.setDateExecuted(dateExecuted.get()); getExecutionTime(workflowRunTimeMinutes).ifPresent(workflowExecution::setExecutionTime); @@ -379,6 +446,90 @@ private Optional calculateTrsInfoFromSourceUrl(CSVRecord workf } } + /** + * Calculates the TRS ID from the source_url by: + *
+     * 1. Looking at all published workflows that have the same workflow path prefix, i.e. they belong to the same GitHub repository.
+     * 2. For each published workflow, getting the primary descriptor path for the version specified in source_url and checking if it matches the primary descriptor path in the source_url.
+     * 3. Ensuring that there is only one workflow in the repository with the same descriptor path. If there are multiple, it is an ambiguous case and we skip the execution.
+ * + * @param sourceUrl + * @param workflowsApi + * @return + */ + private Optional calculateTrsInfoFromSourceUrl(String sourceUrl, WorkflowsApi workflowsApi) { + // Need to figure out the TRS ID and version name using the source_url. + // Example source_url: https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl + // Organization = "theiagen/public_health_viral_genomics", version = "v2.0.0", the rest is the primary descriptor path + // Note that the TRS ID may also have a workflow name, which we need to figure out + final List sourceUrlComponents = getSourceUrlComponents(sourceUrl); + + + final int minNumberOfComponents = 3; + if (sourceUrlComponents.size() < minNumberOfComponents) { + //(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); + return Optional.empty(); + } + + // There should be at least three elements in order for there to be an organization name, foo>/, and version + // in /// + final String organization = sourceUrlComponents.get(0) + "/" + sourceUrlComponents.get(1); + final String version = sourceUrlComponents.get(2); + final String primaryDescriptorPathFromUrl = "/" + String.join("/", sourceUrlComponents.subList(3, sourceUrlComponents.size())); + + final String workflowPathPrefix = "github.com/" + organization; + if (!workflowPathPrefixToWorkflows.containsKey(workflowPathPrefix)) { + try { + List publishedWorkflowsWithSamePathPrefix = workflowsApi.getAllPublishedWorkflowByPath( + workflowPathPrefix).stream() + .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new ConcurrentHashMap<>())).toList(); + workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix); + } catch (ApiException e) { + /* + logSkippedExecution(sourceUrl, workflowMetricRecord, + "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID", + skippedExecutionsCsvPrinter, true); + */ + return Optional.empty(); + } + } + + List workflowsFromSameRepo = workflowPathPrefixToWorkflows.get(workflowPathPrefix); + + List foundFullWorkflowPaths = new ArrayList<>(); + // Loop through each workflow to find one that matches the primary descriptor + workflowsFromSameRepo.forEach(workflow -> { + // Get the primary descriptor path for the version and update the map, either with the primary descriptor path or an empty string to indicate that it was not found + if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { + final String primaryDescriptorAbsolutePath = makePathAbsolute(getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).orElse("")); + workflow.versionToPrimaryDescriptorPath().put(version, primaryDescriptorAbsolutePath); + } + + // Check to see if there's a version that has the same primary descriptor path + final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version); + if (primaryDescriptorPathFromUrl.equals(primaryDescriptorPathForVersion)) { + foundFullWorkflowPaths.add(workflow.fullWorkflowPath()); + } + }); + + if (foundFullWorkflowPaths.isEmpty()) { + //logSkippedExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter, true); + return Optional.empty(); + } else if (foundFullWorkflowPaths.size() > 1) { + // There is already a workflow in the 
same repository with the same descriptor path that we're looking for. + // Skip this source_url because it is an ambiguous case and we can't identify which workflow the source url is referring to. + /* + logSkippedExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", + foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter, true); + */ + return Optional.empty(); + } else { + final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); + return Optional.of(sourceUrlTrsInfo); + } + } + /** * Returns a list of slash-delimited components from the source_url. * Example: given source_url https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl, From 03a52584857a2a092a459bf0fd664bdeaa4280e4 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Mon, 8 Jan 2024 11:20:35 -0500 Subject: [PATCH 09/19] Add test and documentation --- metricsaggregator/README.md | 29 +++++++++++++++- .../client/cli/CommandLineArgs.java | 2 +- .../client/cli/TerraMetricsSubmitter.java | 2 +- .../client/cli/MetricsAggregatorClientIT.java | 33 +++++++++++++++++++ .../src/test/resources/terra-metrics.csv | 6 ++++ 5 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 metricsaggregator/src/test/resources/terra-metrics.csv diff --git a/metricsaggregator/README.md b/metricsaggregator/README.md index d6598f06..febed548 100644 --- a/metricsaggregator/README.md +++ b/metricsaggregator/README.md @@ -79,6 +79,22 @@ Usage:
[options] [command] [command options] Possible Values: [MINIWDL, WOMTOOL, CWLTOOL, NF_VALIDATION, OTHER] * -vv, --validatorVersion The version of the validator tool used to validate the workflows + + submit-terra-metrics Submits workflow metrics provided by Terra via a + CSV file to Dockstore + Usage: submit-terra-metrics [options] + Options: + -c, --config + The config file path. + Default: ./metrics-aggregator.config + * -d, --data + The file path to the CSV file containing workflow metrics from + Terra. The first line of the file should contain the CSV fields: workflow_id,status,workflow_start,workflow_end,workflow_runtime_minutes,source_url + --help + Prints help for metricsaggregator + -r, --recordSkipped + Record skipped executions and the reason skipped to a CSV file + Default: false ``` ### aggregate-metrics @@ -101,4 +117,15 @@ java -jar target/metricsaggregator-*-SNAPSHOT.jar submit-validation-data --confi --data --validator MINIWDL --validatorVersion 1.0 --platform DNA_STACK ``` -After running this command, you will want to run the `aggregate-metrics` command to aggregate the new validation data submitted. \ No newline at end of file +After running this command, you will want to run the `aggregate-metrics` command to aggregate the new validation data submitted. + +### submit-terra-metrics + +The following is an example command that submits metrics from a CSV file that Terra provided, recording the metrics that were skipped into an output CSV file. + +``` +java -jar target/metricsaggregator-*-SNAPSHOT.jar submit-terra-metrics --config my-custom-config \ +--data --recordSkipped +``` + +After running this command, you will want to run the `aggregate-metrics` command to aggregate the new Terra metrics submitted. diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java index 9b78c444..72e52ac6 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java @@ -79,7 +79,7 @@ public Partner getPlatform() { } } - @Parameters(commandNames = { "submit-terra-metrics" }, commandDescription = "Formats workflow validation data specified in a file then submits it to Dockstore") + @Parameters(commandNames = { "submit-terra-metrics" }, commandDescription = "Submits workflow metrics provided by Terra via a CSV file to Dockstore") public static class SubmitTerraMetrics extends CommandLineArgs { @Parameter(names = {"-c", "--config"}, description = "The config file path.") private File config = new File("./" + MetricsAggregatorClient.CONFIG_FILE_NAME); diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index d2062371..27d4de9a 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -329,7 +329,7 @@ public Optional getTerraWorkflowExecutionFromCsvRecord(CSVRecord c RunExecution workflowExecution = new RunExecution(); // TODO: uncomment below when the update executions endpoint PR is merged - workflowExecution.setExecutionId(executionId); + 
//workflowExecution.setExecutionId(executionId);
         workflowExecution.setExecutionStatus(executionStatus.get());
         workflowExecution.setDateExecuted(dateExecuted.get());
         getExecutionTime(workflowRunTimeMinutes).ifPresent(workflowExecution::setExecutionTime);
diff --git a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java
index 5b3ee632..79401c42 100644
--- a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java
+++ b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java
@@ -24,6 +24,8 @@ import static io.dockstore.metricsaggregator.common.TestUtilities.ENDPOINT_OVERRIDE;
 import static io.dockstore.metricsaggregator.common.TestUtilities.createRunExecution;
 import static io.dockstore.metricsaggregator.common.TestUtilities.createValidationExecution;
+import static io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum.ABORTED;
+import static io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum.FAILED;
 import static io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum.FAILED_RUNTIME_INVALID;
 import static io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum.FAILED_SEMANTIC_INVALID;
 import static io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum.SUCCESSFUL;
@@ -444,4 +446,35 @@ void testSubmitValidationData() throws IOException {
         assertFalse(validationExecution.isIsValid());
         assertEquals(validator, validationExecution.getValidatorTool());
     }
+
+    @Test
+    void testSubmitTerraMetrics() throws IOException {
+        final ApiClient apiClient = CommonTestUtilities.getOpenAPIWebClient(true, ADMIN_USERNAME, testingPostgres);
+        final WorkflowsApi workflowsApi = new WorkflowsApi(apiClient);
+
+        Workflow workflow = workflowsApi.getPublishedWorkflow(32L, "metrics");
+        WorkflowVersion version = workflow.getWorkflowVersions().stream().filter(v -> "master".equals(v.getName())).findFirst().orElse(null);
+        assertNotNull(version);
+
+        String id = "#workflow/" + workflow.getFullWorkflowPath();
+        String versionId = version.getName();
+        // This file contains 3 valid rows of executions: 1 failed, 1 successful, and 1 aborted.
+        // It also contains 2 invalid rows of executions: 1 where workflow_start is missing, and one where the source_url is invalid
+        String terraMetricsFilePath = ResourceHelpers.resourceFilePath("terra-metrics.csv");
+
+        // Submit Terra metrics using a CSV file that contains metrics of workflows executed on Terra
+        MetricsAggregatorClient.main(new String[] {"submit-terra-metrics", "--config", CONFIG_FILE_PATH, "--data", terraMetricsFilePath});
+        List<MetricsData> metricsDataList = metricsDataS3Client.getMetricsData(id, versionId);
+        assertEquals(1, metricsDataList.size()); // There should be 1 file in S3.
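(Reviewer note: the fixture rows this test reads follow the header documented in the README hunk above. Below is a minimal, self-contained sketch of how one such row can be read with the same Apache Commons CSV API the submitter uses. `TerraCsvRowSketch`, `TerraRow`, and `toIso8601Duration` are illustrative names invented for this sketch, not part of the patch; the real code maps rows onto the generated `RunExecution` client model, and whether it formats the runtime exactly this way is an assumption.)

```java
import java.io.StringReader;
import java.time.Duration;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;

// Illustrative sketch only; TerraRow and toIso8601Duration are hypothetical names.
public final class TerraCsvRowSketch {

    record TerraRow(String workflowId, String status, String workflowStart, String sourceUrl, String runtime) { }

    // Terra reports runtime in whole minutes; an ISO 8601 duration string is one
    // plausible target representation (e.g. "51" becomes "PT51M").
    static String toIso8601Duration(String workflowRuntimeMinutes) {
        return Duration.ofMinutes(Long.parseLong(workflowRuntimeMinutes)).toString();
    }

    public static void main(String[] args) throws Exception {
        String csv = """
                workflow_id,status,workflow_start,workflow_end,workflow_runtime_minutes,source_url
                d13db165-9289-4ebd-93c7-26db408f84c1,Succeeded,2022-06-30 00:33:44.101000 UTC,2022-06-30 01:25:29.663000 UTC,51,https://raw.githubusercontent.com/dockstore-testing/testWorkflow/master/Dockstore.cwl
                """;
        // Use the first record as the header, mirroring the file layout documented in the README
        CSVFormat format = CSVFormat.DEFAULT.builder().setHeader().setSkipHeaderRecord(true).build();
        for (CSVRecord row : format.parse(new StringReader(csv))) {
            TerraRow parsed = new TerraRow(row.get("workflow_id"), row.get("status"), row.get("workflow_start"),
                    row.get("source_url"), toIso8601Duration(row.get("workflow_runtime_minutes")));
            System.out.println(parsed);
        }
    }
}
```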
+ MetricsData metricsData = metricsDataList.get(0); + // Verify the metrics that were sent to S3 + String metricsDataContent = metricsDataS3Client.getMetricsDataFileContent(metricsData.toolId(), metricsData.toolVersionName(), metricsData.platform(), metricsData.fileName()); + ExecutionsRequestBody executionsRequestBody = GSON.fromJson(metricsDataContent, ExecutionsRequestBody.class); + assertTrue(executionsRequestBody.getValidationExecutions().isEmpty(), "Should not have validation executions"); + List terraWorkflowExecutions = executionsRequestBody.getRunExecutions(); + assertEquals(3, terraWorkflowExecutions.size(), "Should have 3 workflow executions submitted"); + assertTrue(terraWorkflowExecutions.stream().anyMatch(execution -> execution.getExecutionStatus() == SUCCESSFUL)); + assertTrue(terraWorkflowExecutions.stream().anyMatch(execution -> execution.getExecutionStatus() == ABORTED)); + assertTrue(terraWorkflowExecutions.stream().anyMatch(execution -> execution.getExecutionStatus() == FAILED)); + } } diff --git a/metricsaggregator/src/test/resources/terra-metrics.csv b/metricsaggregator/src/test/resources/terra-metrics.csv new file mode 100644 index 00000000..dbeaafcc --- /dev/null +++ b/metricsaggregator/src/test/resources/terra-metrics.csv @@ -0,0 +1,6 @@ +workflow_id,status,workflow_start,workflow_end,workflow_runtime_minutes,source_url +d13db165-9289-4ebd-93c7-26db408f84c1,Succeeded,2022-06-30 00:33:44.101000 UTC,2022-06-30 01:25:29.663000 UTC,51,https://raw.githubusercontent.com/dockstore-testing/testWorkflow/master/Dockstore.cwl +59ce788c-acc6-4aaa-b440-560fe614b7a1,Aborted,2022-03-02 20:37:03.245000 UTC,2022-03-02 21:35:16.497000 UTC,58,https://raw.githubusercontent.com/dockstore-testing/testWorkflow/master/Dockstore.cwl +ed4eaac6-cc95-4e8f-9316-335bb3b13592,Failed,2022-07-15 15:37:06.450000 UTC,2022-07-15 16:29:08.951000 UTC,52,https://raw.githubusercontent.com/dockstore-testing/testWorkflow/master/Dockstore.cwl +f3b368f3-2313-4480-98f6-a3bca3d986ef,Aborted,,,58,https://raw.githubusercontent.com/dockstore-testing/testWorkflow/master/Dockstore.cwl +dac1e206-9a33-4d52-baf3-b04258745534,Succeeded,2022-06-30 00:33:44.101000 UTC,2022-06-30 01:25:29.663000 UTC,51,https://raw.githubusercontent.com/doesntexistorganization/doesntexistrepo/master/Dockstore.cwl \ No newline at end of file From b7669a46477d4be99b5f4c2d66b840715a8cce13 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 11 Jan 2024 09:36:56 -0500 Subject: [PATCH 10/19] PR feedback --- THIRD-PARTY-LICENSES.txt | 8 +- .../client/cli/TerraMetricsSubmitter.java | 215 ++++-------------- 2 files changed, 43 insertions(+), 180 deletions(-) diff --git a/THIRD-PARTY-LICENSES.txt b/THIRD-PARTY-LICENSES.txt index 32349b1a..cbd3c455 100644 --- a/THIRD-PARTY-LICENSES.txt +++ b/THIRD-PARTY-LICENSES.txt @@ -10,11 +10,11 @@ Lists of 417 third-party dependencies. 
(Apache License 2.0) Annotations for Metrics (io.dropwizard.metrics:metrics-annotation:4.2.19 - https://metrics.dropwizard.io/metrics-annotation) (The BSD License) ANTLR 4 Runtime (org.antlr:antlr4-runtime:4.10.1 - http://www.antlr.org/antlr4-runtime) (EPL 2.0) (GPL2 w/ CPE) aopalliance version 1.0 repackaged as a module (org.glassfish.hk2.external:aopalliance-repackaged:3.0.4 - https://github.com/eclipse-ee4j/glassfish-hk2/external/aopalliance-repackaged) - (Apache License, Version 2.0) Apache Avro (org.apache.avro:avro:1.9.1 - https://avro.apache.org) + (Apache-2.0) Apache Avro (org.apache.avro:avro:1.11.3 - https://avro.apache.org) (Apache License, Version 2.0) Apache Commons BeanUtils (commons-beanutils:commons-beanutils:1.9.4 - https://commons.apache.org/proper/commons-beanutils/) (Apache License, Version 2.0) Apache Commons Codec (commons-codec:commons-codec:1.15 - https://commons.apache.org/proper/commons-codec/) (Apache License, Version 2.0) Apache Commons Collections (commons-collections:commons-collections:3.2.2 - http://commons.apache.org/collections/) - (Apache License, Version 2.0) Apache Commons Compress (org.apache.commons:commons-compress:1.21 - https://commons.apache.org/proper/commons-compress/) + (Apache License, Version 2.0) Apache Commons Compress (org.apache.commons:commons-compress:1.22 - https://commons.apache.org/proper/commons-compress/) (Apache License, Version 2.0) (The Apache Software License, Version 2.0) Apache Commons Configuration (org.apache.commons:commons-configuration2:2.8.0 - https://commons.apache.org/proper/commons-configuration/) (Apache-2.0) Apache Commons CSV (org.apache.commons:commons-csv:1.10.0 - https://commons.apache.org/proper/commons-csv/) (Apache License, Version 2.0) Apache Commons Exec (org.apache.commons:commons-exec:1.3 - http://commons.apache.org/proper/commons-exec/) @@ -124,8 +124,8 @@ Lists of 417 third-party dependencies. 
(Cromwell License https://github.com/broadinstitute/cromwell/blob/develop/LICENSE.txt) cromwell-wdl-transforms-new-base (org.broadinstitute:cromwell-wdl-transforms-new-base_2.13:85 - no url defined) (Cromwell License https://github.com/broadinstitute/cromwell/blob/develop/LICENSE.txt) cromwell-wdl-transforms-shared (org.broadinstitute:cromwell-wdl-transforms-shared_2.13:85 - no url defined) (Cromwell License https://github.com/broadinstitute/cromwell/blob/develop/LICENSE.txt) cromwell-wom (org.broadinstitute:cromwell-wom_2.13:85 - no url defined) - (Apache License, Version 2.0) cwlavro-generated (io.cwl:cwlavro-generated:2.0.4.8 - no url defined) - (Apache License, Version 2.0) cwlavro-tools (io.cwl:cwlavro-tools:2.0.4.8 - no url defined) + (Apache License, Version 2.0) cwlavro-generated (io.cwl:cwlavro-generated:2.0.4.9 - no url defined) + (Apache License, Version 2.0) cwlavro-tools (io.cwl:cwlavro-tools:2.0.4.9 - no url defined) (The Apache Software License, Version 2.0) docker-client (com.spotify:docker-client:8.16.0 - https://github.com/spotify/docker-client) (The Apache Software License, Version 2.0) docker-java-api (com.github.docker-java:docker-java-api:3.3.0 - https://github.com/docker-java/docker-java) (The Apache Software License, Version 2.0) docker-java-core (com.github.docker-java:docker-java-core:3.3.0 - https://github.com/docker-java/docker-java) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index 27d4de9a..27fee9db 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -36,6 +36,7 @@ import java.util.Map; import java.util.Optional; import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import org.apache.commons.csv.CSVFormat; @@ -64,11 +65,11 @@ public class TerraMetricsSubmitter { private final AtomicInteger numberOfExecutionsSkipped = new AtomicInteger(0); // Keep track of sourceUrls that are found to be ambiguous - private final Map skippedSourceUrlsToReason = new ConcurrentHashMap<>(); + private final ConcurrentMap skippedSourceUrlsToReason = new ConcurrentHashMap<>(); // Map of source url to TRS info that was previously calculated - private final Map sourceUrlToSourceUrlTrsInfo = new ConcurrentHashMap<>(); + private final ConcurrentMap sourceUrlToSourceUrlTrsInfo = new ConcurrentHashMap<>(); // Map of workflow path prefixes, like github.com/organization/repo, to published workflows with the same workflow path prefix - private final Map> workflowPathPrefixToWorkflows = new ConcurrentHashMap<>(); + private final ConcurrentMap> workflowPathPrefixToWorkflows = new ConcurrentHashMap<>(); public TerraMetricsSubmitter(MetricsAggregatorConfig config, SubmitTerraMetrics submitTerraMetricsCommand) { this.config = config; @@ -100,25 +101,16 @@ public void submitTerraMetrics() { CSVFormat.DEFAULT.builder().setHeader(SkippedTerraMetricsCsvHeaders.class).build()) : null) { // Process the executions by reading 100,000 rows, then processing the rows in parallel + final int batchSize = 100000; for (CSVRecord workflowMetricRecord: workflowMetricRecords) { - final int batchSize = 100000; if (workflowMetricsToProcess.size() < batchSize 
&& workflowMetricRecords.iterator().hasNext()) { workflowMetricsToProcess.add(workflowMetricRecord); } else { workflowMetricsToProcess.add(workflowMetricRecord); + LOG.info("Processing rows {} to {}", workflowMetricsToProcess.get(0).getRecordNumber(), workflowMetricsToProcess.get(workflowMetricsToProcess.size() - 1).getRecordNumber()); // Collect a map of CSV records with the same source URL Map> sourceUrlToCsvRecords = workflowMetricsToProcess.stream() .collect(groupingBy(csvRecord -> csvRecord.get(TerraMetricsCsvHeaders.source_url))); - /* - for (CSVRecord workflowMetricToProcess: workflowMetricsToProcess) { - final String csvRecordSourceUrl = workflowMetricToProcess.get(TerraMetricsCsvHeaders.source_url); - List csvRecordsWithSameSourceUrl = sourceUrlToCsvRecords.getOrDefault(csvRecordSourceUrl, new ArrayList<>()); - csvRecordsWithSameSourceUrl.add(workflowMetricToProcess); - sourceUrlToCsvRecords.put(csvRecordSourceUrl, csvRecordsWithSameSourceUrl); - } - */ - - LOG.info("Processing rows {} to {}", workflowMetricsToProcess.get(0).getRecordNumber(), workflowMetricsToProcess.get(workflowMetricsToProcess.size() - 1).getRecordNumber()); sourceUrlToCsvRecords.entrySet().stream() .parallel() @@ -152,81 +144,23 @@ private void logStats() { LOG.info("Done processing {} executions from Terra. Submitted {} executions. Skipped {} executions.", numberOfExecutionsProcessed, numberOfExecutionsSubmitted, numberOfExecutionsSkipped); } - /** - * Processes the workflow execution from the CSV record and submits the execution to Dockstore if it is valid. - * @param workflowMetricRecord - * @param workflowsApi - * @param extendedGa4GhApi - * @param skippedExecutionsCsvPrinter - */ - private void processWorkflowExecution(CSVRecord workflowMetricRecord, WorkflowsApi workflowsApi, ExtendedGa4GhApi extendedGa4GhApi, CSVPrinter skippedExecutionsCsvPrinter) { - final String sourceUrl = workflowMetricRecord.get(TerraMetricsCsvHeaders.source_url); - LOG.info("Processing execution on row {} with source_url {}", workflowMetricRecord.getRecordNumber(), sourceUrl); - numberOfExecutionsProcessed.incrementAndGet(); - - if (StringUtils.isBlank(sourceUrl)) { - logSkippedExecution("", workflowMetricRecord, "Can't determine TRS ID because source_url is missing", skippedExecutionsCsvPrinter, false); - } - - // Check to see if this source_url was skipped before - if (skippedSourceUrlsToReason.containsKey(sourceUrl)) { - logSkippedExecution(sourceUrl, workflowMetricRecord, skippedSourceUrlsToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true); - return; - } - - if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { - LOG.info("Calculating TRS info for source_url {}", sourceUrl); - Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecord, sourceUrl, workflowsApi, skippedExecutionsCsvPrinter); - if (sourceUrlTrsInfo.isEmpty()) { - return; - } else { - sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); - } - } else { - LOG.info("TRS info for source_url {} was previously calculated", sourceUrl); - } - - // Check if the information from the workflow execution is valid - Optional workflowExecution = getTerraWorkflowExecutionFromCsvRecord(workflowMetricRecord, sourceUrl, skippedExecutionsCsvPrinter); - if (workflowExecution.isEmpty()) { - return; - } - - final SourceUrlTrsInfo sourceUrlTrsInfo = sourceUrlToSourceUrlTrsInfo.get(sourceUrl); - final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(List.of(workflowExecution.get())); - try { - 
extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), - "Terra metrics from Q4 2023"); - numberOfExecutionsSubmitted.incrementAndGet(); - } catch (ApiException e) { - logSkippedExecution(sourceUrl, workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); - } - } - private void submitWorkflowExecutions(String sourceUrl, List workflowMetricRecords, WorkflowsApi workflowsApi, ExtendedGa4GhApi extendedGa4GhApi, CSVPrinter skippedExecutionsCsvPrinter) { LOG.info("Processing source_url {} for {} executions", sourceUrl, workflowMetricRecords.size()); numberOfExecutionsProcessed.addAndGet(workflowMetricRecords.size()); if (StringUtils.isBlank(sourceUrl)) { - workflowMetricRecords.forEach(workflowMetricRecord -> { - logSkippedExecution("", workflowMetricRecord, "Can't determine TRS ID because source_url is missing", skippedExecutionsCsvPrinter, false); - }); + logSkippedExecutions("", workflowMetricRecords, "Can't determine TRS ID because source_url is missing", skippedExecutionsCsvPrinter, false); } // Check to see if this source_url was skipped before if (skippedSourceUrlsToReason.containsKey(sourceUrl)) { - workflowMetricRecords.forEach(workflowMetricRecord -> { - logSkippedExecution(sourceUrl, workflowMetricRecord, skippedSourceUrlsToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true); - }); + logSkippedExecutions(sourceUrl, workflowMetricRecords, skippedSourceUrlsToReason.get(sourceUrl), skippedExecutionsCsvPrinter, true); return; } if (!sourceUrlToSourceUrlTrsInfo.containsKey(sourceUrl)) { - Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(sourceUrl, workflowsApi); + Optional sourceUrlTrsInfo = calculateTrsInfoFromSourceUrl(workflowMetricRecords, sourceUrl, workflowsApi, skippedExecutionsCsvPrinter); if (sourceUrlTrsInfo.isEmpty()) { - workflowMetricRecords.forEach(workflowMetricRecord -> { - logSkippedExecution(sourceUrl, workflowMetricRecord, "Could not calculate TRS info", skippedExecutionsCsvPrinter, true); - }); return; } else { sourceUrlToSourceUrlTrsInfo.put(sourceUrl, sourceUrlTrsInfo.get()); @@ -245,9 +179,7 @@ private void submitWorkflowExecutions(String sourceUrl, List workflow "Terra metrics from Q4 2023"); numberOfExecutionsSubmitted.addAndGet(workflowMetricRecords.size()); } catch (ApiException e) { - workflowMetricRecords.forEach(workflowMetricRecord -> { - logSkippedExecution(sourceUrlTrsInfo.sourceUrl(), workflowMetricRecord, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); - }); + logSkippedExecutions(sourceUrlTrsInfo.sourceUrl(), workflowMetricRecords, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); } } @@ -283,6 +215,20 @@ private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, St numberOfExecutionsSkipped.incrementAndGet(); } + /** + * Performs logging and writing of the skipped executions with the same sourceUrl to an output file. Assumes that all executions are skipped for the same reason. + * If skipFutureExecutionsWithSourceUrl is true, also adds the source_url of the skipped execution to the sourceUrlsToSkipToReason map + * so that future executions with the same source_url are skipped. 
+ * @param sourceUrl sourceUrl of all csvRecordsToSkip + * @param csvRecordsToSkip the CSVRecords to skip + * @param reason the reason the CSVRecords are being skipped + * @param skippedExecutionsCsvPrinter CSVPrinter that writes the skipped reason and records to an output file + * @param skipFutureExecutionsWithSourceUrl boolean indicating if all executions with the same source_url should be skipped + */ + private void logSkippedExecutions(String sourceUrl, List csvRecordsToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) { + csvRecordsToSkip.forEach(csvRecordToSkip -> logSkippedExecution(sourceUrl, csvRecordToSkip, reason, skippedExecutionsCsvPrinter, skipFutureExecutionsWithSourceUrl)); + } + /** * Gets a RunExecution representing a single Terra workflow execution from one row of the CSV file. * If the CSV record is invalid, the function will record the reason why the execution was skipped using skippedExecutionsCsvPrinter. @@ -367,98 +313,19 @@ static Optional getExecutionTime(String workflowRunTimeMinutes) { * Calculates the TRS ID from the source_url by: *
     * <ol>
     *     <li>Looking at all published workflows that have the same workflow path prefix, i.e. they belong to the same GitHub repository.</li>
-    *     <li>For each published workflow, getting the primary descriptor path for the version specified in source_url and checking if it matches the primary desciptor path in the source_url</li>
+    *     <li>For each published workflow, getting the primary descriptor path for the version specified in source_url and checking if it matches the primary descriptor path in the source_url</li>
     *     <li>Ensuring that there is only one workflow in the repository with the same descriptor path. If there are multiple, it is an ambiguous case and we skip the execution</li>
     * </ol>
* - * @param workflowMetricRecord - * @param sourceUrl - * @param workflowsApi - * @param skippedExecutionsCsvPrinter - * @return - */ - private Optional calculateTrsInfoFromSourceUrl(CSVRecord workflowMetricRecord, String sourceUrl, WorkflowsApi workflowsApi, CSVPrinter skippedExecutionsCsvPrinter) { - // Need to figure out the TRS ID and version name using the source_url. - // Example source_url: https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl - // Organization = "theiagen/public_health_viral_genomics", version = "v2.0.0", the rest is the primary descriptor path - // Note that the TRS ID may also have a workflow name, which we need to figure out - final List sourceUrlComponents = getSourceUrlComponents(sourceUrl); - - - final int minNumberOfComponents = 3; - if (sourceUrlComponents.size() < minNumberOfComponents) { - logSkippedExecution(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); - return Optional.empty(); - } - - // There should be at least three elements in order for there to be an organization name, foo>/, and version - // in /// - final String organization = sourceUrlComponents.get(0) + "/" + sourceUrlComponents.get(1); - final String version = sourceUrlComponents.get(2); - final String primaryDescriptorPathFromUrl = "/" + String.join("/", sourceUrlComponents.subList(3, sourceUrlComponents.size())); - - final String workflowPathPrefix = "github.com/" + organization; - if (!workflowPathPrefixToWorkflows.containsKey(workflowPathPrefix)) { - try { - List publishedWorkflowsWithSamePathPrefix = workflowsApi.getAllPublishedWorkflowByPath( - workflowPathPrefix).stream() - .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new ConcurrentHashMap<>())).toList(); - workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix); - } catch (ApiException e) { - logSkippedExecution(sourceUrl, workflowMetricRecord, - "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID", - skippedExecutionsCsvPrinter, true); - return Optional.empty(); - } - } - - List workflowsFromSameRepo = workflowPathPrefixToWorkflows.get(workflowPathPrefix); - - List foundFullWorkflowPaths = new ArrayList<>(); - // Loop through each workflow to find one that matches the primary descriptor - workflowsFromSameRepo.forEach(workflow -> { - // Get the primary descriptor path for the version and update the map, either with the primary descriptor path or an empty string to indicate that it was not found - if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { - final String primaryDescriptorAbsolutePath = makePathAbsolute(getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).orElse("")); - workflow.versionToPrimaryDescriptorPath().put(version, primaryDescriptorAbsolutePath); - } - - // Check to see if there's a version that has the same primary descriptor path - final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version); - if (primaryDescriptorPathFromUrl.equals(primaryDescriptorPathForVersion)) { - foundFullWorkflowPaths.add(workflow.fullWorkflowPath()); - } - }); - - if (foundFullWorkflowPaths.isEmpty()) { - logSkippedExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, 
skippedExecutionsCsvPrinter, - true); - return Optional.empty(); - } else if (foundFullWorkflowPaths.size() > 1) { - // There is already a workflow in the same repository with the same descriptor path that we're looking for. - // Skip this source_url because it is an ambiguous case and we can't identify which workflow the source url is referring to. - logSkippedExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", - foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter, true); - return Optional.empty(); - } else { - final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); - return Optional.of(sourceUrlTrsInfo); - } - } - - /** - * Calculates the TRS ID from the source_url by: - *
-     * <ol>
-     *     <li>Looking at all published workflows that have the same workflow path prefix, i.e. they belong to the same GitHub repository.</li>
-     *     <li>For each published workflow, getting the primary descriptor path for the version specified in source_url and checking if it matches the primary desciptor path in the source_url</li>
-     *     <li>Ensuring that there is only one workflow in the repository with the same descriptor path. If there are multiple, it is an ambiguous case and we skip the execution</li>
-     * </ol>
     *
+ * Also writes skipped executions to an output file. * - * @param sourceUrl - * @param workflowsApi + * @param workflowMetricRecords workflow CSV records, all with the same sourceUrl, to calculate the TRS info for + * @param sourceUrl the sourceUrl of all the workflow CSV records + * @param workflowsApi workflowsApi used to help calculate the TRS info + * @param skippedExecutionsCsvPrinter If the workflow CSV records are skipped, the CSV Printer that writes the reason why it was skipped and the records to an output file * @return */ - private Optional calculateTrsInfoFromSourceUrl(String sourceUrl, WorkflowsApi workflowsApi) { + private Optional calculateTrsInfoFromSourceUrl(List workflowMetricRecords, String sourceUrl, WorkflowsApi workflowsApi, CSVPrinter skippedExecutionsCsvPrinter) { // Need to figure out the TRS ID and version name using the source_url. // Example source_url: https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl // Organization = "theiagen/public_health_viral_genomics", version = "v2.0.0", the rest is the primary descriptor path @@ -468,7 +335,7 @@ private Optional calculateTrsInfoFromSourceUrl(String sourceUr final int minNumberOfComponents = 3; if (sourceUrlComponents.size() < minNumberOfComponents) { - //(sourceUrl, workflowMetricRecord, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); + logSkippedExecutions(sourceUrl, workflowMetricRecords, "Not enough components in the source_url to figure out the TRS ID and version", skippedExecutionsCsvPrinter, true); return Optional.empty(); } @@ -486,11 +353,9 @@ private Optional calculateTrsInfoFromSourceUrl(String sourceUr .map(workflow -> new MinimalWorkflowInfo(workflow.getId(), workflow.getFullWorkflowPath(), workflow.getDescriptorType(), new ConcurrentHashMap<>())).toList(); workflowPathPrefixToWorkflows.put(workflowPathPrefix, publishedWorkflowsWithSamePathPrefix); } catch (ApiException e) { - /* - logSkippedExecution(sourceUrl, workflowMetricRecord, + logSkippedExecutions(sourceUrl, workflowMetricRecords, "Could not get all published workflows for workflow path " + workflowPathPrefix + " to determine TRS ID", skippedExecutionsCsvPrinter, true); - */ return Optional.empty(); } } @@ -501,28 +366,26 @@ private Optional calculateTrsInfoFromSourceUrl(String sourceUr // Loop through each workflow to find one that matches the primary descriptor workflowsFromSameRepo.forEach(workflow -> { // Get the primary descriptor path for the version and update the map, either with the primary descriptor path or an empty string to indicate that it was not found - if (!workflow.versionToPrimaryDescriptorPath().containsKey(version)) { + if (!workflow.versionToPrimaryDescriptorPathMap().containsKey(version)) { final String primaryDescriptorAbsolutePath = makePathAbsolute(getPrimaryDescriptorAbsolutePath(workflowsApi, workflow, version).orElse("")); - workflow.versionToPrimaryDescriptorPath().put(version, primaryDescriptorAbsolutePath); + workflow.versionToPrimaryDescriptorPathMap().put(version, primaryDescriptorAbsolutePath); } // Check to see if there's a version that has the same primary descriptor path - final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPath().get(version); + final String primaryDescriptorPathForVersion = workflow.versionToPrimaryDescriptorPathMap().get(version); if (primaryDescriptorPathFromUrl.equals(primaryDescriptorPathForVersion)) { 
foundFullWorkflowPaths.add(workflow.fullWorkflowPath()); } }); if (foundFullWorkflowPaths.isEmpty()) { - //logSkippedExecution(sourceUrl, workflowMetricRecord, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter, true); + logSkippedExecutions(sourceUrl, workflowMetricRecords, "Could not find workflow with primary descriptor " + primaryDescriptorPathFromUrl, skippedExecutionsCsvPrinter, true); return Optional.empty(); } else if (foundFullWorkflowPaths.size() > 1) { // There is already a workflow in the same repository with the same descriptor path that we're looking for. // Skip this source_url because it is an ambiguous case and we can't identify which workflow the source url is referring to. - /* - logSkippedExecution(sourceUrl, workflowMetricRecord, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", + logSkippedExecutions(sourceUrl, workflowMetricRecords, String.format("There's %s workflows in the repository with the same primary descriptor path '%s': %s", foundFullWorkflowPaths.size(), primaryDescriptorPathFromUrl, foundFullWorkflowPaths), skippedExecutionsCsvPrinter, true); - */ return Optional.empty(); } else { final SourceUrlTrsInfo sourceUrlTrsInfo = new SourceUrlTrsInfo(sourceUrl, "#workflow/" + foundFullWorkflowPaths.get(0), version); @@ -567,6 +430,6 @@ static String makePathAbsolute(String path) { public record SourceUrlTrsInfo(String sourceUrl, String trsId, String version) { } - public record MinimalWorkflowInfo(long id, String fullWorkflowPath, DescriptorTypeEnum descriptorType, ConcurrentHashMap versionToPrimaryDescriptorPath) { + public record MinimalWorkflowInfo(long id, String fullWorkflowPath, DescriptorTypeEnum descriptorType, ConcurrentMap versionToPrimaryDescriptorPathMap) { } } From 3580491123913c0b8835f38bad6a6b3a6ea8d1c3 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 11 Jan 2024 10:02:30 -0500 Subject: [PATCH 11/19] Only display time elapsed if there was a command --- .../client/cli/MetricsAggregatorClient.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java index 7aeaafe5..d4d4af41 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java @@ -150,8 +150,11 @@ public static void main(String[] args) { } } } - final Instant endTime = Instant.now(); - LOG.info("{} took {}", jCommander.getParsedCommand(), Duration.between(startTime, endTime)); + + if (jCommander.getParsedCommand() != null) { + final Instant endTime = Instant.now(); + LOG.info("{} took {}", jCommander.getParsedCommand(), Duration.between(startTime, endTime)); + } } private void aggregateMetrics(MetricsAggregatorConfig config) throws URISyntaxException { From e10f6683592cae1baed3a3091d69439c8d2b2488 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 11 Jan 2024 11:09:17 -0500 Subject: [PATCH 12/19] Use AtomicInteger for metricsaggregator --- .../MetricsAggregatorS3Client.java | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java 
b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java index f660de51..7b7721a7 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java @@ -37,6 +37,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Queue; +import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import software.amazon.awssdk.services.s3.S3Client; @@ -48,9 +49,9 @@ public class MetricsAggregatorS3Client { private static final Logger LOG = LoggerFactory.getLogger(MetricsAggregatorS3Client.class); private static final Gson GSON = new Gson(); - private int numberOfDirectoriesProcessed = 0; - private int numberOfMetricsSubmitted = 0; - private int numberOfMetricsSkipped = 0; + private final AtomicInteger numberOfDirectoriesProcessed = new AtomicInteger(0); + private final AtomicInteger numberOfMetricsSubmitted = new AtomicInteger(0); + private final AtomicInteger numberOfMetricsSkipped = new AtomicInteger(0); private final String bucketName; @@ -82,7 +83,7 @@ public void aggregateMetrics(ExtendedGa4GhApi extendedGa4GhApi) { metricsDirectories.stream() .parallel() .forEach(directoryInfo -> aggregateMetricsForDirectory(directoryInfo, extendedGa4GhApi)); - LOG.info("Completed aggregating metrics. Processed {} directories, submitted {} metrics, and skipped {} metrics", numberOfDirectoriesProcessed, numberOfMetricsSubmitted, numberOfMetricsSkipped); + LOG.info("Completed aggregating metrics. Processed {} directories, submitted {} platform metrics, and skipped {} platform metrics", numberOfDirectoriesProcessed, numberOfMetricsSubmitted, numberOfMetricsSkipped); } private void aggregateMetricsForDirectory(S3DirectoryInfo directoryInfo, ExtendedGa4GhApi extendedGa4GhApi) { @@ -101,18 +102,20 @@ private void aggregateMetricsForDirectory(S3DirectoryInfo directoryInfo, Extende allSubmissions = getExecutions(toolId, versionName, platform); } catch (Exception e) { LOG.error("Error aggregating metrics: Could not get all executions from directory {}", versionS3KeyPrefix, e); - ++numberOfMetricsSkipped; + numberOfMetricsSkipped.incrementAndGet(); continue; // Continue aggregating metrics for other directories } try { getAggregatedMetrics(allSubmissions).ifPresent(metrics -> { extendedGa4GhApi.aggregatedMetricsPut(metrics, platform, toolId, versionName); - System.out.printf("Aggregated metrics for tool ID %s, version %s, platform %s from directory %s%n", toolId, versionName, platform, versionS3KeyPrefix); + LOG.info("Aggregated metrics for tool ID {}, version {}, platform {} from directory {}", toolId, versionName, platform, versionS3KeyPrefix); allMetrics.add(metrics); + numberOfMetricsSubmitted.incrementAndGet(); }); } catch (Exception e) { LOG.error("Error aggregating metrics: Could not put all executions from directory {}", versionS3KeyPrefix, e); + numberOfMetricsSkipped.incrementAndGet(); // Continue aggregating metrics for other platforms } } @@ -122,19 +125,20 @@ private void aggregateMetricsForDirectory(S3DirectoryInfo directoryInfo, Extende try { getAggregatedMetrics(new ExecutionsRequestBody().aggregatedExecutions(allMetrics)).ifPresent(metrics -> { extendedGa4GhApi.aggregatedMetricsPut(metrics, Partner.ALL.name(), toolId, versionName); - System.out.printf("Aggregated metrics across all platforms (%s) for tool ID %s, version %s from directory %s%n", + LOG.info("Aggregated 
metrics across all platforms ({}) for tool ID {}, version {} from directory {}", platformsString, toolId, versionName, versionS3KeyPrefix); allMetrics.add(metrics); + numberOfMetricsSubmitted.incrementAndGet(); }); } catch (Exception e) { LOG.error("Error aggregating metrics across all platforms ({}) for tool ID {}, version {} from directory {}", platformsString, toolId, versionName, versionS3KeyPrefix, e); + numberOfMetricsSkipped.incrementAndGet(); // Continue aggregating metrics for other directories } - ++numberOfMetricsSubmitted; } else { - ++numberOfMetricsSkipped; + numberOfMetricsSkipped.incrementAndGet(); } - ++numberOfDirectoriesProcessed; + numberOfDirectoriesProcessed.incrementAndGet(); LOG.info("Processed {} directories", numberOfDirectoriesProcessed); } From 3df55f74b983808eecef678e57c3c1c39a5ab379 Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Mon, 15 Jan 2024 10:33:00 -0500 Subject: [PATCH 13/19] Move tooltester ExceptionHandler to utils package --- tooltester/pom.xml | 5 +++++ .../io/dockstore/tooltester/client/cli/Client.java | 6 +++--- .../tooltester/runWorkflow/WorkflowRunner.java | 10 +++++----- .../tooltester/runWorkflow/WorkflowRunnerConfig.java | 6 +++--- .../io/dockstore/tooltester/client/cli/ClientTest.java | 2 +- .../java/io/dockstore/utils}/ExceptionHandler.java | 2 +- 6 files changed, 18 insertions(+), 13 deletions(-) rename {tooltester/src/main/java/io/dockstore/tooltester/helper => utils/src/main/java/io/dockstore/utils}/ExceptionHandler.java (97%) diff --git a/tooltester/pom.xml b/tooltester/pom.xml index 4030343a..90efd0c0 100644 --- a/tooltester/pom.xml +++ b/tooltester/pom.xml @@ -114,6 +114,11 @@ + + io.dockstore + utils + ${revision}${changelist} + com.fasterxml.jackson.jaxrs jackson-jaxrs-base diff --git a/tooltester/src/main/java/io/dockstore/tooltester/client/cli/Client.java b/tooltester/src/main/java/io/dockstore/tooltester/client/cli/Client.java index 26879e3b..f4c8ba27 100644 --- a/tooltester/src/main/java/io/dockstore/tooltester/client/cli/Client.java +++ b/tooltester/src/main/java/io/dockstore/tooltester/client/cli/Client.java @@ -20,12 +20,12 @@ import static io.dockstore.common.S3ClientHelper.getVersionName; import static io.dockstore.tooltester.client.cli.JCommanderUtility.out; import static io.dockstore.tooltester.client.cli.JCommanderUtility.printJCommanderHelp; -import static io.dockstore.tooltester.helper.ExceptionHandler.COMMAND_ERROR; -import static io.dockstore.tooltester.helper.ExceptionHandler.DEBUG; -import static io.dockstore.tooltester.helper.ExceptionHandler.exceptionMessage; import static io.dockstore.tooltester.runWorkflow.WorkflowRunner.GSON; import static io.dockstore.tooltester.runWorkflow.WorkflowRunner.printLine; import static io.dockstore.tooltester.runWorkflow.WorkflowRunner.uploadRunInfo; +import static io.dockstore.utils.ExceptionHandler.COMMAND_ERROR; +import static io.dockstore.utils.ExceptionHandler.DEBUG; +import static io.dockstore.utils.ExceptionHandler.exceptionMessage; import com.beust.jcommander.JCommander; import com.beust.jcommander.MissingCommandException; diff --git a/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunner.java b/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunner.java index 50a237cf..e1af816e 100644 --- a/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunner.java +++ b/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunner.java @@ -19,11 +19,11 @@ import static 
io.dockstore.common.S3ClientHelper.createFileName; import static io.dockstore.common.metrics.MetricsDataS3Client.generateKey; import static io.dockstore.tooltester.client.cli.JCommanderUtility.out; -import static io.dockstore.tooltester.helper.ExceptionHandler.API_ERROR; -import static io.dockstore.tooltester.helper.ExceptionHandler.COMMAND_ERROR; -import static io.dockstore.tooltester.helper.ExceptionHandler.GENERIC_ERROR; -import static io.dockstore.tooltester.helper.ExceptionHandler.errorMessage; -import static io.dockstore.tooltester.helper.ExceptionHandler.exceptionMessage; +import static io.dockstore.utils.ExceptionHandler.API_ERROR; +import static io.dockstore.utils.ExceptionHandler.COMMAND_ERROR; +import static io.dockstore.utils.ExceptionHandler.GENERIC_ERROR; +import static io.dockstore.utils.ExceptionHandler.errorMessage; +import static io.dockstore.utils.ExceptionHandler.exceptionMessage; import static java.util.UUID.randomUUID; import static org.apache.commons.lang3.math.NumberUtils.max; import static org.apache.commons.lang3.math.NumberUtils.min; diff --git a/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunnerConfig.java b/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunnerConfig.java index 8c2a48c1..74a5c789 100644 --- a/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunnerConfig.java +++ b/tooltester/src/main/java/io/dockstore/tooltester/runWorkflow/WorkflowRunnerConfig.java @@ -1,8 +1,8 @@ package io.dockstore.tooltester.runWorkflow; -import static io.dockstore.tooltester.helper.ExceptionHandler.COMMAND_ERROR; -import static io.dockstore.tooltester.helper.ExceptionHandler.IO_ERROR; -import static io.dockstore.tooltester.helper.ExceptionHandler.exceptionMessage; +import static io.dockstore.utils.ExceptionHandler.COMMAND_ERROR; +import static io.dockstore.utils.ExceptionHandler.IO_ERROR; +import static io.dockstore.utils.ExceptionHandler.exceptionMessage; import static java.util.UUID.randomUUID; import java.io.BufferedWriter; diff --git a/tooltester/src/test/java/io/dockstore/tooltester/client/cli/ClientTest.java b/tooltester/src/test/java/io/dockstore/tooltester/client/cli/ClientTest.java index 2e221c68..dc032a46 100644 --- a/tooltester/src/test/java/io/dockstore/tooltester/client/cli/ClientTest.java +++ b/tooltester/src/test/java/io/dockstore/tooltester/client/cli/ClientTest.java @@ -1,7 +1,7 @@ package io.dockstore.tooltester.client.cli; import static io.dockstore.tooltester.client.cli.Client.main; -import static io.dockstore.tooltester.helper.ExceptionHandler.COMMAND_ERROR; +import static io.dockstore.utils.ExceptionHandler.COMMAND_ERROR; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static uk.org.webcompere.systemstubs.SystemStubs.catchSystemExit; diff --git a/tooltester/src/main/java/io/dockstore/tooltester/helper/ExceptionHandler.java b/utils/src/main/java/io/dockstore/utils/ExceptionHandler.java similarity index 97% rename from tooltester/src/main/java/io/dockstore/tooltester/helper/ExceptionHandler.java rename to utils/src/main/java/io/dockstore/utils/ExceptionHandler.java index b243ab07..a9088ba5 100644 --- a/tooltester/src/main/java/io/dockstore/tooltester/helper/ExceptionHandler.java +++ b/utils/src/main/java/io/dockstore/utils/ExceptionHandler.java @@ -1,4 +1,4 @@ -package io.dockstore.tooltester.helper; +package io.dockstore.utils; import java.util.concurrent.atomic.AtomicBoolean; import org.slf4j.Logger; From 
c24c223fed320b6a886d7e798ccec8402a8bbe0d Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Mon, 15 Jan 2024 11:07:09 -0500 Subject: [PATCH 14/19] Only call System.exit() from ExceptionHandler --- .../client/cli/MetricsAggregatorClient.java | 43 +++++++------------ .../client/cli/TerraMetricsSubmitter.java | 9 ++-- .../client/cli/MetricsAggregatorClientIT.java | 4 +- .../common/TestUtilities.java | 10 +---- .../client/cli/TopicGeneratorClient.java | 28 +++++------- .../java/io/dockstore/utils/CLIConstants.java | 9 ---- .../io/dockstore/utils/ConfigFileUtils.java | 14 +++--- 7 files changed, 42 insertions(+), 75 deletions(-) delete mode 100644 utils/src/main/java/io/dockstore/utils/CLIConstants.java diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java index d4d4af41..e273f495 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClient.java @@ -17,9 +17,10 @@ package io.dockstore.metricsaggregator.client.cli; -import static io.dockstore.utils.CLIConstants.FAILURE_EXIT_CODE; import static io.dockstore.utils.ConfigFileUtils.getConfiguration; import static io.dockstore.utils.DockstoreApiClientUtils.setupApiClient; +import static io.dockstore.utils.ExceptionHandler.GENERIC_ERROR; +import static io.dockstore.utils.ExceptionHandler.exceptionMessage; import com.beust.jcommander.JCommander; import com.beust.jcommander.MissingCommandException; @@ -42,7 +43,6 @@ import java.time.Duration; import java.time.Instant; import java.util.List; -import java.util.Optional; import org.apache.commons.configuration2.INIConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -81,15 +81,14 @@ public static void main(String[] args) { } catch (MissingCommandException e) { jCommander.usage(); if (e.getUnknownCommand().isEmpty()) { - LOG.error("No command entered", e); + LOG.error("No command entered"); } else { - LOG.error("Unknown command", e); + LOG.error("Unknown command"); } - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "The command is missing", GENERIC_ERROR); } catch (ParameterException e) { jCommander.usage(); - LOG.error("Error parsing arguments", e); - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "Error parsing arguments", GENERIC_ERROR); } if (jCommander.getParsedCommand() == null || commandLineArgs.isHelp()) { @@ -98,55 +97,43 @@ public static void main(String[] args) { if (aggregateMetricsCommand.isHelp()) { jCommander.usage(); } else { - final Optional config = getConfiguration(aggregateMetricsCommand.getConfig()); - if (config.isEmpty()) { - System.exit(FAILURE_EXIT_CODE); - } + INIConfiguration config = getConfiguration(aggregateMetricsCommand.getConfig()); try { - final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config.get()); + final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config); metricsAggregatorClient.aggregateMetrics(metricsAggregatorConfig); } catch (Exception e) { - LOG.error("Could not aggregate metrics", e); - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "Could not aggregate metrics", GENERIC_ERROR); } } } else if ("submit-validation-data".equals(jCommander.getParsedCommand())) { if (submitValidationData.isHelp()) { jCommander.usage(); } else { - final 
Optional config = getConfiguration(submitValidationData.getConfig()); - if (config.isEmpty()) { - System.exit(FAILURE_EXIT_CODE); - } + INIConfiguration config = getConfiguration(submitValidationData.getConfig()); try { - final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config.get()); + final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config); metricsAggregatorClient.submitValidationData(metricsAggregatorConfig, submitValidationData.getValidator(), submitValidationData.getValidatorVersion(), submitValidationData.getDataFilePath(), submitValidationData.getPlatform()); } catch (Exception e) { - LOG.error("Could not submit validation metrics to Dockstore", e); - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "Could not submit validation metrics to Dockstore", GENERIC_ERROR); } } } else if ("submit-terra-metrics".equals(jCommander.getParsedCommand())) { if (submitTerraMetrics.isHelp()) { jCommander.usage(); } else { - final Optional config = getConfiguration(submitTerraMetrics.getConfig()); - if (config.isEmpty()) { - System.exit(FAILURE_EXIT_CODE); - } + INIConfiguration config = getConfiguration(submitTerraMetrics.getConfig()); try { - final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config.get()); + final MetricsAggregatorConfig metricsAggregatorConfig = new MetricsAggregatorConfig(config); final TerraMetricsSubmitter submitTerraMetricsCommand = new TerraMetricsSubmitter(metricsAggregatorConfig, submitTerraMetrics); submitTerraMetricsCommand.submitTerraMetrics(); } catch (Exception e) { - LOG.error("Could not submit terra metrics to Dockstore", e); - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "Could not submit Terra metrics to Dockstore", GENERIC_ERROR); } } } diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java index 27fee9db..53182ac2 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java @@ -1,7 +1,8 @@ package io.dockstore.metricsaggregator.client.cli; -import static io.dockstore.utils.CLIConstants.FAILURE_EXIT_CODE; import static io.dockstore.utils.DockstoreApiClientUtils.setupApiClient; +import static io.dockstore.utils.ExceptionHandler.IO_ERROR; +import static io.dockstore.utils.ExceptionHandler.exceptionMessage; import static java.util.stream.Collectors.groupingBy; import io.dockstore.common.Partner; @@ -125,8 +126,7 @@ public void submitTerraMetrics() { } } } catch (IOException e) { - LOG.error("Unable to create new CSV output file", e); - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "Unable to create new CSV output file", IO_ERROR); } logStats(); @@ -135,8 +135,7 @@ public void submitTerraMetrics() { LOG.info("View skipped executions in file {}", outputFileName); } } catch (IOException e) { - LOG.error("Unable to read input CSV file", e); - System.exit(FAILURE_EXIT_CODE); + exceptionMessage(e, "Unable to read input CSV file", IO_ERROR); } } diff --git a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java index 79401c42..6cde6c79 100644 --- 
a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java +++ b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/MetricsAggregatorClientIT.java @@ -31,6 +31,7 @@ import static io.dockstore.openapi.client.model.RunExecution.ExecutionStatusEnum.SUCCESSFUL; import static io.dockstore.openapi.client.model.ValidationExecution.ValidatorToolEnum.MINIWDL; import static io.dockstore.openapi.client.model.ValidationExecution.ValidatorToolEnum.WOMTOOL; +import static io.dockstore.utils.ExceptionHandler.IO_ERROR; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -62,7 +63,6 @@ import io.dockstore.openapi.client.model.ValidatorVersionInfo; import io.dockstore.openapi.client.model.Workflow; import io.dockstore.openapi.client.model.WorkflowVersion; -import io.dockstore.utils.CLIConstants; import io.dockstore.webservice.DockstoreWebserviceApplication; import io.dockstore.webservice.DockstoreWebserviceConfiguration; import io.dropwizard.testing.DropwizardTestSupport; @@ -395,7 +395,7 @@ private static void testOverallAggregatedMetrics(WorkflowVersion version, String @Test void testAggregateMetricsErrors() throws Exception { int exitCode = catchSystemExit(() -> MetricsAggregatorClient.main(new String[] {"aggregate-metrics", "--config", "thisdoesntexist"})); - assertEquals(CLIConstants.FAILURE_EXIT_CODE, exitCode); + assertEquals(IO_ERROR, exitCode); } @Test diff --git a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/common/TestUtilities.java b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/common/TestUtilities.java index d79097a7..1247b5ac 100644 --- a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/common/TestUtilities.java +++ b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/common/TestUtilities.java @@ -26,7 +26,6 @@ import io.dropwizard.testing.ResourceHelpers; import java.io.File; import java.time.Instant; -import java.util.Optional; import org.apache.commons.configuration2.INIConfiguration; public final class TestUtilities { @@ -60,12 +59,7 @@ public static ValidationExecution createValidationExecution(ValidatorToolEnum va } public static MetricsAggregatorConfig getMetricsConfig() { - Optional iniConfig = ConfigFileUtils.getConfiguration(new File(CONFIG_FILE_PATH)); - if (iniConfig.isEmpty()) { - throw new RuntimeException("Unable to get config file"); - } - - MetricsAggregatorConfig config = new MetricsAggregatorConfig(iniConfig.get()); - return config; + INIConfiguration iniConfig = ConfigFileUtils.getConfiguration(new File(CONFIG_FILE_PATH)); + return new MetricsAggregatorConfig(iniConfig); } } diff --git a/topicgenerator/src/main/java/io/dockstore/topicgenerator/client/cli/TopicGeneratorClient.java b/topicgenerator/src/main/java/io/dockstore/topicgenerator/client/cli/TopicGeneratorClient.java index 5e9fb3b1..309a5af1 100644 --- a/topicgenerator/src/main/java/io/dockstore/topicgenerator/client/cli/TopicGeneratorClient.java +++ b/topicgenerator/src/main/java/io/dockstore/topicgenerator/client/cli/TopicGeneratorClient.java @@ -1,7 +1,9 @@ package io.dockstore.topicgenerator.client.cli; -import static io.dockstore.utils.CLIConstants.FAILURE_EXIT_CODE; import static io.dockstore.utils.ConfigFileUtils.getConfiguration; +import static io.dockstore.utils.ExceptionHandler.GENERIC_ERROR; +import static 
+import static io.dockstore.utils.ExceptionHandler.IO_ERROR;
+import static io.dockstore.utils.ExceptionHandler.exceptionMessage;
 
 import com.beust.jcommander.JCommander;
 import com.beust.jcommander.MissingCommandException;
@@ -34,7 +36,6 @@
 import java.time.temporal.ChronoUnit;
 import java.util.HashMap;
 import java.util.List;
-import java.util.Optional;
 import org.apache.commons.configuration2.INIConfiguration;
 import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
@@ -75,15 +76,14 @@ public static void main(String[] args) {
         } catch (MissingCommandException e) {
             jCommander.usage();
             if (e.getUnknownCommand().isEmpty()) {
-                LOG.error("No command entered", e);
+                LOG.error("No command entered");
             } else {
-                LOG.error("Unknown command", e);
+                LOG.error("Unknown command");
             }
-            System.exit(FAILURE_EXIT_CODE);
+            exceptionMessage(e, "The command is missing", GENERIC_ERROR);
         } catch (ParameterException e) {
             jCommander.usage();
-            LOG.error("Error parsing arguments", e);
-            System.exit(FAILURE_EXIT_CODE);
+            exceptionMessage(e, "Error parsing arguments", GENERIC_ERROR);
         }
 
         if (jCommander.getParsedCommand() == null || commandLineArgs.isHelp()) {
@@ -92,12 +92,8 @@ public static void main(String[] args) {
         if (generateTopicsCommand.isHelp()) {
             jCommander.usage();
         } else {
-            final Optional<INIConfiguration> config = getConfiguration(generateTopicsCommand.getConfig());
-            if (config.isEmpty()) {
-                LOG.error("Unable to get topic-generator config file");
-                System.exit(FAILURE_EXIT_CODE);
-            }
-            final TopicGeneratorConfig topicGeneratorConfig = new TopicGeneratorConfig(config.get());
+            final INIConfiguration config = getConfiguration(generateTopicsCommand.getConfig());
+            final TopicGeneratorConfig topicGeneratorConfig = new TopicGeneratorConfig(config);
 
             // Read CSV file
             Iterable<CSVRecord> entriesCsvRecords = null;
@@ -110,8 +106,7 @@ public static void main(String[] args) {
                         .build();
                 entriesCsvRecords = csvFormat.parse(entriesCsv);
             } catch (IOException e) {
-                LOG.error("Unable to read input CSV file", e);
-                System.exit(FAILURE_EXIT_CODE);
+                exceptionMessage(e, "Unable to read input CSV file", IO_ERROR);
             }
 
             final TopicGeneratorClient topicGeneratorClient = new TopicGeneratorClient();
@@ -165,8 +160,7 @@ private void generateTopics(TopicGeneratorConfig topicGeneratorConfig, Iterable<
                 }
             }
         } catch (IOException e) {
-            LOG.error("Unable to create new CSV output file", e);
-            System.exit(FAILURE_EXIT_CODE);
+            exceptionMessage(e, "Unable to create new CSV output file", IO_ERROR);
         }
     }
 
diff --git a/utils/src/main/java/io/dockstore/utils/CLIConstants.java b/utils/src/main/java/io/dockstore/utils/CLIConstants.java
deleted file mode 100644
index c671e79f..00000000
--- a/utils/src/main/java/io/dockstore/utils/CLIConstants.java
+++ /dev/null
@@ -1,9 +0,0 @@
-package io.dockstore.utils;
-
-public final class CLIConstants {
-    public static final int SUCCESS_EXIT_CODE = 0;
-    public static final int FAILURE_EXIT_CODE = 1;
-
-    private CLIConstants() {
-    }
-}
diff --git a/utils/src/main/java/io/dockstore/utils/ConfigFileUtils.java b/utils/src/main/java/io/dockstore/utils/ConfigFileUtils.java
index 306f6a2c..b6390513 100644
--- a/utils/src/main/java/io/dockstore/utils/ConfigFileUtils.java
+++ b/utils/src/main/java/io/dockstore/utils/ConfigFileUtils.java
@@ -1,7 +1,9 @@
 package io.dockstore.utils;
 
+import static io.dockstore.utils.ExceptionHandler.IO_ERROR;
+import static io.dockstore.utils.ExceptionHandler.exceptionMessage;
+
 import java.io.File;
-import java.util.Optional;
 import org.apache.commons.configuration2.INIConfiguration;
 import org.apache.commons.configuration2.SubnodeConfiguration;
 import org.apache.commons.configuration2.builder.fluent.Configurations;
@@ -20,16 +22,16 @@ public final class ConfigFileUtils {
     private ConfigFileUtils() {
     }
 
-    public static Optional<INIConfiguration> getConfiguration(File iniFile) {
+    public static INIConfiguration getConfiguration(File iniFile) {
         Configurations configs = new Configurations();
+        INIConfiguration config = null;
         try {
-            INIConfiguration config = configs.ini(iniFile);
-            return Optional.of(config);
+            config = configs.ini(iniFile);
         } catch (ConfigurationException e) {
-            LOG.error(CONFIG_FILE_ERROR, e);
-            return Optional.empty();
+            exceptionMessage(e, CONFIG_FILE_ERROR, IO_ERROR);
         }
+        return config;
     }
 
     public static SubnodeConfiguration getDockstoreSection(INIConfiguration iniConfig) {
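
The net effect of [PATCH 01/19] above is that getConfiguration() now fails fast inside ExceptionHandler instead of forcing every caller to unwrap an Optional and call System.exit itself. ExceptionHandler is not part of this diff; its contract can only be inferred from the call sites and from the IT change that asserts the process exit code equals IO_ERROR. A minimal sketch under those assumptions (the constant names appear in the patch, but their numeric values and the method body below are guesses, not the real dockstore-support source):

```java
package io.dockstore.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical sketch of the utility the patch delegates to. Inferred behavior:
// log the message and cause, then terminate with the given code so that tests
// can observe it via catchSystemExit.
public final class ExceptionHandler {
    public static final int GENERIC_ERROR = 1; // assumed value
    public static final int IO_ERROR = 3;      // assumed value

    private static final Logger LOG = LoggerFactory.getLogger(ExceptionHandler.class);

    private ExceptionHandler() {
    }

    public static void exceptionMessage(Exception e, String message, int exitCode) {
        LOG.error(message, e);
        System.exit(exitCode);
    }
}
```

Because exceptionMessage terminates the process, the `return config` after the catch block in ConfigFileUtils is only reached on success, which is what lets the method drop Optional from its signature.
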
From 0737cd9d04a41136e5334af910911b6467b8f6a1 Mon Sep 17 00:00:00 2001
From: Kathy Tran
Date: Mon, 15 Jan 2024 11:20:18 -0500
Subject: [PATCH 15/19] Make description more generic and add optional description argument
---
 metricsaggregator/README.md                            |  3 +++
 .../metricsaggregator/client/cli/CommandLineArgs.java  |  8 +++++++-
 .../client/cli/TerraMetricsSubmitter.java              |  7 +++++--
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/metricsaggregator/README.md b/metricsaggregator/README.md
index febed548..784b5575 100644
--- a/metricsaggregator/README.md
+++ b/metricsaggregator/README.md
@@ -90,6 +90,9 @@ Usage: <main class> [options] [command] [command options]
     * -d, --data
         The file path to the CSV file containing workflow metrics from Terra.
         The first line of the file should contain the CSV fields:
         workflow_id,status,workflow_start,workflow_end,workflow_runtime_minutes,source_url
+      -de, --description
+        Optional description about the metrics to include when submitting
+        metrics to Dockstore
       --help
         Prints help for metricsaggregator
       -r, --recordSkipped
diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java
index 72e52ac6..adb93a31 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/CommandLineArgs.java
@@ -91,15 +91,21 @@ public static class SubmitTerraMetrics extends CommandLineArgs {
         @Parameter(names = {"-r", "--recordSkipped"}, description = "Record skipped executions and the reason skipped to a CSV file")
         private boolean recordSkippedExecutions;
 
+        @Parameter(names = {"-de", "--description"}, description = "Optional description about the metrics to include when submitting metrics to Dockstore")
+        private String description;
+
         public File getConfig() {
             return config;
         }
 
-
         public String getDataFilePath() {
             return dataFilePath;
         }
 
+        public String getDescription() {
+            return description;
+        }
+
         public boolean isRecordSkippedExecutions() {
             return recordSkippedExecutions;
         }
diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
index 53182ac2..925ae2b8 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
@@ -174,8 +174,11 @@ private void submitWorkflowExecutions(String sourceUrl, List<CSVRecord> workflow
                 .toList();
         final ExecutionsRequestBody executionsRequestBody = new ExecutionsRequestBody().runExecutions(workflowExecutionsToSubmit);
         try {
-            extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(),
-                    "Terra metrics from Q4 2023");
+            String description = "Submitted using the metricsaggregator's submit-terra-metrics command";
+            if (StringUtils.isNotBlank(submitTerraMetricsCommand.getDescription())) {
" + submitTerraMetricsCommand.getDescription(); + } + extendedGa4GhApi.executionMetricsPost(executionsRequestBody, Partner.TERRA.toString(), sourceUrlTrsInfo.trsId(), sourceUrlTrsInfo.version(), description); numberOfExecutionsSubmitted.addAndGet(workflowMetricRecords.size()); } catch (ApiException e) { logSkippedExecutions(sourceUrlTrsInfo.sourceUrl(), workflowMetricRecords, String.format("Could not submit execution metrics to Dockstore for workflow %s: %s", sourceUrlTrsInfo, e.getMessage()), skippedExecutionsCsvPrinter, false); From 350c8e2668b8cbaf430539caf538dc180cf6cb6d Mon Sep 17 00:00:00 2001 From: Kathy Tran Date: Thu, 18 Jan 2024 10:12:29 -0500 Subject: [PATCH 16/19] Submit terra execution ID --- .../MetricsAggregatorS3Client.java | 15 --------------- .../client/cli/TerraMetricsSubmitter.java | 3 +-- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java index 0fd4d4bd..f86e3fb2 100644 --- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java +++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java @@ -40,7 +40,6 @@ import java.util.List; import java.util.Map; import java.util.Queue; -import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -215,20 +214,6 @@ private ExecutionsRequestBody getExecutions(String toolId, String versionName, S .aggregatedExecutions(executionIdToAggregatedExecutionMap.values().stream().toList()); } - /** - * If the execution ID is null, generate a random one for the purposes of aggregation. - * Executions that were submitted to S3 prior to the existence of execution IDs don't have an execution ID, - * thus for the purposes of aggregation, generate one. - * @param executionId - * @return - */ - private String generateExecutionIdIfNull(String executionId) { - if (executionId == null) { - return UUID.randomUUID().toString(); - } - return executionId; - } - /** * Returns a unique list of directories containing metrics files. * For example, suppose the local-dockstore-metrics-data bucket looks like the following. 
From 350c8e2668b8cbaf430539caf538dc180cf6cb6d Mon Sep 17 00:00:00 2001
From: Kathy Tran
Date: Thu, 18 Jan 2024 10:12:29 -0500
Subject: [PATCH 16/19] Submit terra execution ID
---
 .../MetricsAggregatorS3Client.java            | 15 ---------------
 .../client/cli/TerraMetricsSubmitter.java     |  3 +--
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java
index 0fd4d4bd..f86e3fb2 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java
@@ -40,7 +40,6 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Queue;
-import java.util.UUID;
 import java.util.concurrent.atomic.AtomicInteger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -215,20 +214,6 @@ private ExecutionsRequestBody getExecutions(String toolId, String versionName, S
                 .aggregatedExecutions(executionIdToAggregatedExecutionMap.values().stream().toList());
     }
 
-    /**
-     * If the execution ID is null, generate a random one for the purposes of aggregation.
-     * Executions that were submitted to S3 prior to the existence of execution IDs don't have an execution ID,
-     * thus for the purposes of aggregation, generate one.
-     * @param executionId
-     * @return
-     */
-    private String generateExecutionIdIfNull(String executionId) {
-        if (executionId == null) {
-            return UUID.randomUUID().toString();
-        }
-        return executionId;
-    }
-
     /**
      * Returns a unique list of directories containing metrics files.
     * For example, suppose the local-dockstore-metrics-data bucket looks like the following.
diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
index 925ae2b8..9c78a9b3 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
@@ -276,8 +276,7 @@ public Optional<RunExecution> getTerraWorkflowExecutionFromCsvRecord(CSVRecord c
         }
 
         RunExecution workflowExecution = new RunExecution();
-        // TODO: uncomment below when the update executions endpoint PR is merged
-        //workflowExecution.setExecutionId(executionId);
+        workflowExecution.setExecutionId(executionId);
         workflowExecution.setExecutionStatus(executionStatus.get());
         workflowExecution.setDateExecuted(dateExecuted.get());
         getExecutionTime(workflowRunTimeMinutes).ifPresent(workflowExecution::setExecutionTime);
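
[PATCH 16/19] pairs the removal of generateExecutionIdIfNull with the now-uncommented setExecutionId call: the Terra workflow_id column (a UUID) is submitted as the Dockstore execution ID instead of an ID being generated at aggregation time. Schematically (a sketch using the generated client model; the helper and its behavior notes are illustrative, not Dockstore code):

```java
import io.dockstore.openapi.client.model.RunExecution;

class ExecutionIdSketch {
    // Each CSV row's workflow_id is carried through as the execution's
    // identifier, presumably so a re-submitted Terra execution maps onto the
    // same Dockstore record rather than being counted twice.
    static RunExecution fromTerraRow(String workflowId) {
        RunExecution workflowExecution = new RunExecution();
        workflowExecution.setExecutionId(workflowId); // e.g. a UUID string from the Terra export
        return workflowExecution;
    }
}
```
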
From 62938abc8e8bba369f2f5bca14b907fe471da88d Mon Sep 17 00:00:00 2001
From: Kathy Tran
Date: Thu, 18 Jan 2024 10:35:20 -0500
Subject: [PATCH 17/19] Fix test
---
 .../metricsaggregator/client/cli/TerraMetricsSubmitter.java      | 4 ++++
 .../client/cli/TerraMetricsSubmitterTest.java                    | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
index 9c78a9b3..9ae67202 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
@@ -402,6 +402,10 @@ private Optional<SourceUrlTrsInfo> calculateTrsInfoFromSourceUrl(List
      * @return
      */
    static List<String> getSourceUrlComponents(String sourceUrl) {
+        final String rawGitHubUrlPrefix = "https://raw.githubusercontent.com/";
+        if (!sourceUrl.startsWith(rawGitHubUrlPrefix)) {
+            return List.of();
+        }
         final String sourceUrlWithoutGitHubPrefix = sourceUrl.replace("https://raw.githubusercontent.com/", "");
         return Arrays.stream(sourceUrlWithoutGitHubPrefix.split("/"))
                 .filter(urlComponent -> !urlComponent.isEmpty()) // Filter out empty strings that are a result of consecutive slashes '//'
diff --git a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java
index 50b431c4..b27023b6 100644
--- a/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java
+++ b/metricsaggregator/src/test/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitterTest.java
@@ -46,7 +46,7 @@ void testGetSourceUrlComponents() {
         assertEquals(List.of("theiagen", "public_health_viral_genomics", "v2.0.0", "workflows", "wf_theiacov_fasta.wdl"), getSourceUrlComponents("https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0/workflows/wf_theiacov_fasta.wdl"));
         // This source_url has consecutive slashes
         assertEquals(List.of("theiagen", "public_health_viral_genomics", "v2.0.0", "workflows", "wf_theiacov_fasta.wdl"), getSourceUrlComponents("https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl"));
-        assertEquals(List.of(), getSourceUrlComponents("https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl"));
+        assertEquals(List.of(), getSourceUrlComponents("https://nottherawgithuburlprefix/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl"));
     }
 
     @Test
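
The guard added in [PATCH 17/19] is easiest to see in isolation. Below is a self-contained rendition of the patched helper plus the two behaviors the updated test pins down; the class wrapper and main method are scaffolding for illustration, not Dockstore code:

```java
import java.util.Arrays;
import java.util.List;

class SourceUrlComponentsSketch {
    // Rendition of the patched getSourceUrlComponents: URLs that are not raw
    // GitHub URLs short-circuit to an empty list instead of being split into
    // meaningless components.
    static List<String> getSourceUrlComponents(String sourceUrl) {
        final String rawGitHubUrlPrefix = "https://raw.githubusercontent.com/";
        if (!sourceUrl.startsWith(rawGitHubUrlPrefix)) {
            return List.of();
        }
        final String sourceUrlWithoutGitHubPrefix = sourceUrl.replace(rawGitHubUrlPrefix, "");
        return Arrays.stream(sourceUrlWithoutGitHubPrefix.split("/"))
                .filter(urlComponent -> !urlComponent.isEmpty()) // consecutive slashes '//' yield empty segments
                .toList();
    }

    public static void main(String[] args) {
        // [theiagen, public_health_viral_genomics, v2.0.0, workflows, wf_theiacov_fasta.wdl]
        System.out.println(getSourceUrlComponents(
                "https://raw.githubusercontent.com/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl"));
        // [] -- not a raw GitHub URL, so the caller can skip the execution cleanly
        System.out.println(getSourceUrlComponents(
                "https://nottherawgithuburlprefix/theiagen/public_health_viral_genomics/v2.0.0//workflows/wf_theiacov_fasta.wdl"));
    }
}
```
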
From c66bb0dd921d03729d80cfa88226af0a6f0704f9 Mon Sep 17 00:00:00 2001
From: Kathy Tran
Date: Thu, 18 Jan 2024 15:12:01 -0500
Subject: [PATCH 18/19] Log skipped executions to console
---
 metricsaggregator/README.md                   |  8 +++--
 .../client/cli/TerraMetricsSubmitter.java     | 32 +++++++++++++++----
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/metricsaggregator/README.md b/metricsaggregator/README.md
index 784b5575..c9283644 100644
--- a/metricsaggregator/README.md
+++ b/metricsaggregator/README.md
@@ -69,11 +69,15 @@ Usage: <main class> [options] [command] [command options]
         format of the workflows that were validated by the validator specified.
         The first line of the file should contain the CSV fields:
         trsID,versionName,isValid,dateExecuted
+      -id, --executionId
+        The execution ID to use for each validation execution. Assumes
+        that each validation in the file is performed on unique workflows
+        and workflow versions.
       --help
         Prints help for metricsaggregator
     * -p, --platform
         The platform that the workflow was validated on
-        Possible Values: [GALAXY, TERRA, DNA_STACK, DNA_NEXUS, CGC, NHLBI_BIODATA_CATALYST, ANVIL, CAVATICA, NEXTFLOW_TOWER, ELWAZI, AGC, OTHER]
+        Possible Values: [GALAXY, TERRA, DNA_STACK, DNA_NEXUS, CGC, NHLBI_BIODATA_CATALYST, ANVIL, CAVATICA, NEXTFLOW_TOWER, ELWAZI, AGC, OTHER, ALL]
     * -v, --validator
         The validator tool used to validate the workflows
         Possible Values: [MINIWDL, WOMTOOL, CWLTOOL, NF_VALIDATION, OTHER]
@@ -117,7 +121,7 @@ with miniwdl on DNAstack:
 ```
 java -jar target/metricsaggregator-*-SNAPSHOT.jar submit-validation-data --config my-custom-config \
---data <data-file> --validator MINIWDL --validatorVersion 1.0 --platform DNA_STACK
+--data <data-file> --validator MINIWDL --validatorVersion 1.0 --platform DNA_STACK --executionId a02075d9-092a-4fe7-9f83-4abf11de3dc9
 ```
 
 After running this command, you will want to run the `aggregate-metrics` command to aggregate the new validation data submitted.
diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
index 9ae67202..70dc2df4 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/client/cli/TerraMetricsSubmitter.java
@@ -194,8 +194,13 @@ private void submitWorkflowExecutions(String sourceUrl, List<CSVRecord> workflow
      * @param reason Reason that this execution is being skipped
      * @param skippedExecutionsCsvPrinter CSVPrinter that writes to the output file
      * @param skipFutureExecutionsWithSourceUrl boolean indicating if all executions with the same source_url should be skipped
+     * @param logToConsole boolean indicating if the reason skipped should be logged to the console
      */
-    private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) {
+    private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl, boolean logToConsole) {
+        if (logToConsole) {
+            LOG.warn("Skipping execution on row {} with source_url {}: {}", csvRecordToSkip.getRecordNumber(), sourceUrl, reason);
+        }
+
         // Record to map for future reference. Only want to do this if the skip reason applies for ALL executions with the source_url.
         // Should not add to this map if the skip reason is specific to one execution
         if (skipFutureExecutionsWithSourceUrl) {
@@ -217,6 +222,18 @@
         numberOfExecutionsSkipped.incrementAndGet();
     }
 
+    /**
+     * Performs logging and writing of the skipped execution to the console and output file.
+     * Assumes that the execution being skipped is for a reason that is only applicable to that execution.
+     * @param sourceUrl source_url of the csvRecordToSkip
+     * @param csvRecordToSkip CSVRecord to skip
+     * @param reason Reason that this execution is being skipped
+     * @param skippedExecutionsCsvPrinter CSVPrinter that writes to the output file
+     */
+    private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter) {
+        logSkippedExecution(sourceUrl, csvRecordToSkip, reason, skippedExecutionsCsvPrinter, false, true);
+    }
+
     /**
      * Performs logging and writing of the skipped executions with the same sourceUrl to an output file. Assumes that all executions are skipped for the same reason.
      * If skipFutureExecutionsWithSourceUrl is true, also adds the source_url of the skipped execution to the sourceUrlsToSkipToReason map
@@ -228,7 +245,8 @@ private void logSkippedExecution(String sourceUrl, CSVRecord csvRecordToSkip, St
      * @param skipFutureExecutionsWithSourceUrl boolean indicating if all executions with the same source_url should be skipped
      */
     private void logSkippedExecutions(String sourceUrl, List<CSVRecord> csvRecordsToSkip, String reason, CSVPrinter skippedExecutionsCsvPrinter, boolean skipFutureExecutionsWithSourceUrl) {
-        csvRecordsToSkip.forEach(csvRecordToSkip -> logSkippedExecution(sourceUrl, csvRecordToSkip, reason, skippedExecutionsCsvPrinter, skipFutureExecutionsWithSourceUrl));
+        LOG.warn("Skipping {} executions with source_url {}: {}", csvRecordsToSkip.size(), sourceUrl, reason);
+        csvRecordsToSkip.forEach(csvRecordToSkip -> logSkippedExecution(sourceUrl, csvRecordToSkip, reason, skippedExecutionsCsvPrinter, skipFutureExecutionsWithSourceUrl, false));
     }
 
     /**
@@ -248,30 +266,30 @@ public Optional<RunExecution> getTerraWorkflowExecutionFromCsvRecord(CSVRecord c
 
         // Check that all required fields are present
         if (StringUtils.isBlank(executionId)) {
-            logSkippedExecution(sourceUrl, csvRecord, "The required field workflow_id is missing", skippedExecutionsCsvPrinter, false);
+            logSkippedExecution(sourceUrl, csvRecord, "The required field workflow_id is missing", skippedExecutionsCsvPrinter);
             return Optional.empty();
         }
 
         if (StringUtils.isBlank(workflowStart)) {
-            logSkippedExecution(sourceUrl, csvRecord, "The required field workflow_start is missing", skippedExecutionsCsvPrinter, false);
+            logSkippedExecution(sourceUrl, csvRecord, "The required field workflow_start is missing", skippedExecutionsCsvPrinter);
             return Optional.empty();
         }
 
         if (StringUtils.isBlank(status)) {
-            logSkippedExecution(sourceUrl, csvRecord, "The required field status is missing", skippedExecutionsCsvPrinter, false);
+            logSkippedExecution(sourceUrl, csvRecord, "The required field status is missing", skippedExecutionsCsvPrinter);
             return Optional.empty();
         }
 
         // Format fields into Dockstore schema
         final Optional<ExecutionStatusEnum> executionStatus = getExecutionStatusEnumFromTerraStatus(status);
         if (executionStatus.isEmpty()) {
-            logSkippedExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter, false);
+            logSkippedExecution(sourceUrl, csvRecord, "Could not get a valid ExecutionStatusEnum from status '" + status + "'", skippedExecutionsCsvPrinter);
             return Optional.empty();
         }
 
         final Optional<String> dateExecuted = formatStringInIso8601Date(workflowStart);
         if (dateExecuted.isEmpty()) {
-            logSkippedExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter, false);
+            logSkippedExecution(sourceUrl, csvRecord, "Could not get a valid dateExecuted from workflow_start '" + workflowStart + "'", skippedExecutionsCsvPrinter);
             return Optional.empty();
         }
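
The overload structure in [PATCH 18/19] is the interesting part: per-row console warnings default to on for individually skipped rows, while batch skips print a single summary WARN and pass logToConsole=false, so the console is not flooded even though the CSV still receives every row. Reduced to its skeleton (a sketch of the pattern, not the actual class):

```java
import java.util.List;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class SkipLoggingSketch {
    private static final Logger LOG = LoggerFactory.getLogger(SkipLoggingSketch.class);

    // Single-row skip: console logging defaults to on.
    void logSkippedExecution(String sourceUrl, CSVRecord row, String reason) {
        logSkippedExecution(sourceUrl, row, reason, true);
    }

    void logSkippedExecution(String sourceUrl, CSVRecord row, String reason, boolean logToConsole) {
        if (logToConsole) {
            LOG.warn("Skipping execution on row {} with source_url {}: {}", row.getRecordNumber(), sourceUrl, reason);
        }
        // ... write the row to the skipped-executions CSV (omitted in this sketch) ...
    }

    // Batch skip: one summary line, then per-row console output suppressed.
    void logSkippedExecutions(String sourceUrl, List<CSVRecord> rows, String reason) {
        LOG.warn("Skipping {} executions with source_url {}: {}", rows.size(), sourceUrl, reason);
        rows.forEach(row -> logSkippedExecution(sourceUrl, row, reason, false));
    }
}
```
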
From 125f9d73f87f439e60420b0ab6559029444fd3bb Mon Sep 17 00:00:00 2001
From: Kathy Tran
Date: Fri, 19 Jan 2024 11:50:52 -0500
Subject: [PATCH 19/19] Add more logging
---
 .../MetricsAggregatorS3Client.java | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java
index f86e3fb2..179b3337 100644
--- a/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java
+++ b/metricsaggregator/src/main/java/io/dockstore/metricsaggregator/MetricsAggregatorS3Client.java
@@ -39,6 +39,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Queue;
 import java.util.concurrent.atomic.AtomicInteger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -110,12 +111,16 @@ private void aggregateMetricsForDirectory(S3DirectoryInfo directoryInfo, Extende
         }
 
         try {
-            getAggregatedMetrics(allSubmissions).ifPresent(metrics -> {
-                extendedGa4GhApi.aggregatedMetricsPut(metrics, platform, toolId, versionName);
-                LOG.info("Aggregated metrics for tool ID {}, version {}, platform {} from directory {}", toolId, versionName, platform, versionS3KeyPrefix);
-                allMetrics.add(metrics);
+            Optional<Metrics> aggregatedPlatformMetric = getAggregatedMetrics(allSubmissions);
+            if (aggregatedPlatformMetric.isPresent()) {
+                extendedGa4GhApi.aggregatedMetricsPut(aggregatedPlatformMetric.get(), platform, toolId, versionName);
+                LOG.info("Aggregated metrics for tool ID {}, version {}, platform {} from directory {}", toolId, versionName, platform,
+                        versionS3KeyPrefix);
+                allMetrics.add(aggregatedPlatformMetric.get());
                 numberOfMetricsSubmitted.incrementAndGet();
-            });
+            } else {
+                LOG.error("Error aggregating metrics for tool ID {}, version {}, platform {} from directory {}", toolId, versionName, platform, versionS3KeyPrefix);
+            }
         } catch (Exception e) {
             LOG.error("Error aggregating metrics: Could not put all executions from directory {}", versionS3KeyPrefix, e);
             numberOfMetricsSkipped.incrementAndGet();
@@ -139,6 +144,7 @@ private void aggregateMetricsForDirectory(S3DirectoryInfo directoryInfo, Extende
                 // Continue aggregating metrics for other directories
             }
         } else {
+            LOG.error("Error aggregating metrics for directory {}: no platform metrics aggregated", versionS3KeyPrefix);
             numberOfMetricsSkipped.incrementAndGet();
         }
         numberOfDirectoriesProcessed.incrementAndGet();
@@ -170,7 +176,7 @@ private ExecutionsRequestBody getExecutions(String toolId, String versionName, S
         try {
             executionsFromOneSubmission = GSON.fromJson(fileContent, ExecutionsRequestBody.class);
         } catch (JsonSyntaxException e) {
-            LOG.error("Could not read execution(s) from S3 key {}, ignoring file", metricsData.s3Key());
+            LOG.error("Could not read execution(s) from S3 key {}, ignoring file", metricsData.s3Key(), e);
             continue;
         }
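
A closing note on [PATCH 19/19]: replacing ifPresent(lambda) with an explicit isPresent()/get() pair is what makes the new error logging possible at all, since a consumer passed to ifPresent has no else branch. The shape of the change, in miniature (a generic sketch, not Dockstore code):

```java
import java.util.Optional;

class EmptyCaseLoggingSketch {
    static void before(Optional<String> maybeMetrics) {
        // The empty case is silent: ifPresent offers nowhere to log a failure.
        maybeMetrics.ifPresent(metrics -> System.out.println("submitting " + metrics));
    }

    static void after(Optional<String> maybeMetrics) {
        if (maybeMetrics.isPresent()) {
            System.out.println("submitting " + maybeMetrics.get());
        } else {
            System.err.println("error: nothing was aggregated"); // the new, observable failure path
        }
    }
}
```
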