Commit ab29390

Remove expanded config files. Move entity-level hint computation into indexing job. (#485)

* Move hint generation into indexing job. Change service to call BQ.

* Require data type in original config file.

* Remove expanded broad/cms_synpuf.

* Remove expanded broad/aou_synthetic.

* Remove expanded verily/cms_synpuf.

* Remove expanded verily/aou_synthetic.

* Remove expanded verily/pilot_synthea_2022q3.

* Remove expanded verily/sdd.

* Remove expanded verily/sdd_refresh0323.

* Combine create+compute jobs. Remove expanded from underlay file paths.

* Remove expanded vumc/sdd.

* Remove expanded vumc/sdd_refresh0323.

* Update tests with new index dataset name.

* Update verify-config GHA: remove expanded check, update pairs of underlays that should be in sync.

* Fix comment in hints controller.

* Rename command EXPAND_CONFIG -> VALIDATE_CONFIG. Add logging output when data types don't match expected.

* Remove expanded aou/test/SC2022Q4R6.

* Remove expanded aou/test/SR2022Q4R6.

* Remove SNP entity from verily/sdd. This underlay is out-of-date.

* Add logging in data type validation loop. Remove expanded in run server script for aou/test.

* Remove SNP entity from vumc/sdd. This underlay is out-of-date.

* Update docs with command change EXPAND_CONFIG > VALIDATE_CONFIG.

* Disable SDD queries test.

* Fix modifier hint attribute lookup bug.
marikomedlock authored Jul 31, 2023
1 parent 7d9c04d commit ab29390
Showing 1,045 changed files with 4,414 additions and 76,076 deletions.
36 changes: 0 additions & 36 deletions .github/tools/verify_config_has_been_expanded.sh

This file was deleted.

8 changes: 4 additions & 4 deletions .github/tools/verify_config_in_sync.sh
@@ -7,8 +7,8 @@

configs_to_compare_list="
cms_synpuf broad/cms_synpuf verily/cms_synpuf\n
aou_synthetic broad/aou_synthetic verily/aou_synthetic\n
-sdd vumc/sdd verily/sdd
+sdd vumc/sdd verily/sdd\n
+sdd_refresh0323 vumc/sdd_refresh0323 verily/sdd_refresh0323
"

# Needed for for loop to split only on newline, not on space
@@ -23,8 +23,8 @@ do
underlay_2=$(echo ${configs_to_compare} | awk '{print $3}')
printf "\nComparing ${underlay_1} to ${underlay_2}\n"

-underlay_dir_1=$(echo service/src/main/resources/config/${underlay_1}/original)
-underlay_dir_2=$(echo service/src/main/resources/config/${underlay_2}/original)
+underlay_dir_1=$(echo service/src/main/resources/config/${underlay_1})
+underlay_dir_2=$(echo service/src/main/resources/config/${underlay_2})
# --ignore-all-space because files sometimes have a newline at end of file, and sometimes don't
diff_output=$(diff -rq --ignore-all-space --exclude ${underlay_name}.json --exclude sql ${underlay_dir_1} ${underlay_dir_2})

5 changes: 0 additions & 5 deletions .github/workflows/validate-config.yaml
@@ -44,8 +44,3 @@ jobs:
TEST_PROJECT_SA_KEY: ${{ secrets.TEST_PROJECT_SA_KEY }}
- name: Verify configs are in sync, for example vumc/sdd and verily/sdd
run: .github/tools/verify_config_in_sync.sh
-      - name: Verify underlay configs have been expanded
-        if: always()
-        run: .github/tools/verify_config_has_been_expanded.sh
-        env:
-          GOOGLE_APPLICATION_CREDENTIALS: ../rendered/broad/tanagra_sa.json
55 changes: 27 additions & 28 deletions docs/INDEXING.md
@@ -6,7 +6,7 @@
+ [Setup Credentials](#setup-credentials)
+ [Default Application Credentials](#default-application-credentials)
+ [gcloud Credentials](#gcloud-credentials)
-  + [Expand Underlay Config](#expand-underlay-config)
+  + [Validate Underlay Config](#validate-underlay-config)
+ [Create Index Dataset](#create-index-dataset)
+ [Kickoff Jobs](#kickoff-jobs)
+ [All Jobs](#all-jobs)
@@ -63,8 +63,7 @@ Before running the indexing jobs, you need to specify the underlay config files.
There are 4 steps to generating the index tables:
1. [Setup](#setup-credentials) credentials with read permissions on the source data, and read-write permissions on
the index data.
-2. [Expand](#expand-underlay-config) the user-specified underlay config to include information from scanning the
-source data. For example, data types and UI hints.
+2. [Validate](#validate-underlay-config) the user-specified underlay config.
3. [Create](#create-index-dataset) the index dataset, if it doesn't already exist.
4. [Kickoff](#kickoff-jobs) the jobs.

@@ -103,13 +102,14 @@ To use end-user credentials:
gcloud auth login
```

-### Expand Underlay Config
-Expand the defaults, scan the source data, and generate an expanded underlay config file that includes all this
-information. The first argument is a pointer to the user-specified underlay file.
-The second argument is a pointer to the directory where Tanagra can write the expanded config files.
-Both arguments must be absolute paths. Example:
+### Validate Underlay Config
+Validate the config files by scanning the source dataset. This is not strictly required, but is highly recommended.
+In particular, this will check whether the attribute data types you specified match the underlying source data.
+If they don't, or you did not specify a data type, then this command will print out what you should set it to.

+The argument is a pointer to the top-level config file. It must be an absolute path. Example:
```
-./gradlew indexer:index -Dexec.args="EXPAND_CONFIG $HOME/tanagra/service/src/main/resources/config/broad/cms_synpuf/original/cms_synpuf.json $HOME/tanagra/service/src/main/resources/config/broad/cms_synpuf/expanded"
+./gradlew indexer:index -Dexec.args="VALIDATE_CONFIG $HOME/tanagra/service/src/main/resources/config/broad/cms_synpuf/cms_synpuf.json"
```
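
For reference, this commit also requires the data type to be declared in the original config file. Below is a minimal, hypothetical fragment of an entity attribute declaration — the field names and values here are illustrative only, so check your own underlay config files for the exact schema:
```
"attributes": [
  { "name": "id", "dataType": "INT64" },
  { "name": "standard_concept", "dataType": "STRING" }
]
```
If a declared `dataType` doesn't match what the validation computes from the source data, the command above prints the expected value to use.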

### Create Index Dataset
@@ -128,11 +128,11 @@ Do a dry run of all the indexing jobs. This provides a sanity check that the ind…
query inputs, are valid. This step is not required, but highly recommended to help catch errors/bugs sooner and without
running a bunch of computation first.
```
-./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/output/omop.json DRY_RUN"
+./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/omop/omop.json DRY_RUN"
```
Now actually kick off all the indexing jobs.
```
-./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/output/omop.json"
+./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/omop/omop.json"
```
This can take a long time to complete. If e.g. your computer falls asleep or you need to kill the process on your
computer, you can re-run the same command again. You need to check that there are no in-progress Dataflow jobs in the
@@ -146,13 +146,13 @@ …kicking them off again.
You can also kickoff the indexing jobs for a single entity or entity group. This is helpful for testing and debugging.
To kick off all the indexing jobs for a particular entity:
```
-./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/output/omop.json person DRY_RUN"
-./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/output/omop.json person"
+./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person DRY_RUN"
+./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person"
```
or entity group:
```
-./gradlew indexer:index -Dexec.args="INDEX_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/output/omop.json condition_occurrence_person DRY_RUN"
-./gradlew indexer:index -Dexec.args="INDEX_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/output/omop.json condition_occurrence_person"
+./gradlew indexer:index -Dexec.args="INDEX_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/omop/omop.json condition_occurrence_person DRY_RUN"
+./gradlew indexer:index -Dexec.args="INDEX_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/omop/omop.json condition_occurrence_person"
```
All the entities in a group should be indexed before the group. The `INDEX_ALL` command ensures this ordering, but keep
this in mind if you're running the jobs for each entity or entity group separately.
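
For example, here is a sketch of indexing the members of the `condition_occurrence_person` group before the group itself. The member entity names are inferred from the group name, so adjust them to match your underlay:
```
./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person"
./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json condition_occurrence"
./gradlew indexer:index -Dexec.args="INDEX_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/omop/omop.json condition_occurrence_person"
```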
@@ -163,8 +163,8 @@ this in mind if you're running the jobs for each entity or entity group separately.
By default, the indexing jobs are run concurrently as much as possible. You can force it to run jobs serially by
appending `SERIAL` to the command:
```
-./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/output/omop.json DRY_RUN SERIAL"
-./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/output/omop.json NOT_DRY_RUN SERIAL"
+./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/omop/omop.json DRY_RUN SERIAL"
+./gradlew indexer:index -Dexec.args="INDEX_ALL $HOME/tanagra/service/src/main/resources/config/omop/omop.json NOT_DRY_RUN SERIAL"
```

### Re-Run Jobs
@@ -174,18 +174,18 @@ commands below. Similar to the indexing commands, the clean commands also respect…

To clean the generated index tables for everything:
```
-./gradlew indexer:index -Dexec.args="CLEAN_ALL $HOME/tanagra/service/src/main/resources/config/output/omop.json DRY_RUN"
-./gradlew indexer:index -Dexec.args="CLEAN_ALL $HOME/tanagra/service/src/main/resources/config/output/omop.json"
+./gradlew indexer:index -Dexec.args="CLEAN_ALL $HOME/tanagra/service/src/main/resources/config/omop/omop.json DRY_RUN"
+./gradlew indexer:index -Dexec.args="CLEAN_ALL $HOME/tanagra/service/src/main/resources/config/omop/omop.json"
```
or a particular entity:
```
-./gradlew indexer:index -Dexec.args="CLEAN_ENTITY $HOME/tanagra/service/src/main/resources/config/output/omop.json person DRY_RUN"
-./gradlew indexer:index -Dexec.args="CLEAN_ENTITY $HOME/tanagra/service/src/main/resources/config/output/omop.json person"
+./gradlew indexer:index -Dexec.args="CLEAN_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person DRY_RUN"
+./gradlew indexer:index -Dexec.args="CLEAN_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person"
```
or a particular entity group:
```
-./gradlew indexer:index -Dexec.args="CLEAN_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/output/omop.json person DRY_RUN"
-./gradlew indexer:index -Dexec.args="CLEAN_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/output/omop.json person"
+./gradlew indexer:index -Dexec.args="CLEAN_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/omop/omop.json condition_occurrence_person DRY_RUN"
+./gradlew indexer:index -Dexec.args="CLEAN_ENTITY_GROUP $HOME/tanagra/service/src/main/resources/config/omop/omop.json condition_occurrence_person"
```
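
Putting the two together, a typical re-run for a single entity might look like this — a sketch using the same paths as the examples above:
```
./gradlew indexer:index -Dexec.args="CLEAN_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person"
./gradlew indexer:index -Dexec.args="INDEX_ENTITY $HOME/tanagra/service/src/main/resources/config/omop/omop.json person"
```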

### Run dataflow locally
@@ -216,13 +216,12 @@ You can see the underlay config files defined for this dataset in…
Note that while the source dataset is public, the index dataset that Tanagra generates is not.

```
-export INPUT_DIR=$HOME/tanagra/service/src/main/resources/config/broad/cms_synpuf/original
-export OUTPUT_DIR=$HOME/tanagra/service/src/main/resources/config/broad/cms_synpuf/expanded
+export CONFIG_FILE=$HOME/tanagra/service/src/main/resources/config/broad/cms_synpuf/cms_synpuf.json
-./gradlew indexer:index -Dexec.args="EXPAND_CONFIG $INPUT_DIR/cms_synpuf.json $OUTPUT_DIR/"
+./gradlew indexer:index -Dexec.args="VALIDATE_CONFIG $CONFIG_FILE"
bq mk --location=US broad-tanagra-dev:cmssynpuf_index
-./gradlew indexer:index -Dexec.args="INDEX_ALL $OUTPUT_DIR/cms_synpuf.json DRY_RUN"
-./gradlew indexer:index -Dexec.args="INDEX_ALL $OUTPUT_DIR/cms_synpuf.json"
+./gradlew indexer:index -Dexec.args="INDEX_ALL $CONFIG_FILE DRY_RUN"
+./gradlew indexer:index -Dexec.args="INDEX_ALL $CONFIG_FILE"
```
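
To confirm the index tables landed, one option is to list the tables in the new dataset — this assumes you have the `bq` CLI installed and read access to the index dataset:
```
bq ls broad-tanagra-dev:cmssynpuf_index
```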
1 change: 0 additions & 1 deletion indexer/build.gradle
@@ -73,7 +73,6 @@ tasks.withType(Test) {
}
}

-// e.g. ./gradlew indexer:index -Dexec.args="input_underlay.json output_dir"
task index (type:JavaExec) {
main = "bio.terra.tanagra.indexing.Main"
classpath = sourceSets.main.runtimeClasspath
@@ -99,6 +99,19 @@ protected boolean checkTableExists(TablePointer tablePointer) {
.isPresent();
}

+  protected boolean checkOneRowExists(TablePointer tablePointer) {
+    // Check if the table has at least one row.
+    BigQueryDataset dataPointer = getBQDataPointer(tablePointer);
+    int numRows =
+        dataPointer
+            .getBigQueryService()
+            .getNumRows(
+                dataPointer.getProjectId(),
+                dataPointer.getDatasetId(),
+                tablePointer.getTableName());
+    return numRows > 0;
+  }

protected boolean checkOneNotNullIdRowExists(Entity entity) {
// Check if the table has at least 1 id row where id IS NOT NULL
FieldPointer idField =
73 changes: 52 additions & 21 deletions indexer/src/main/java/bio/terra/tanagra/indexing/Indexer.java
@@ -1,24 +1,18 @@
package bio.terra.tanagra.indexing;

-import bio.terra.tanagra.indexing.job.BuildNumChildrenAndPaths;
-import bio.terra.tanagra.indexing.job.BuildTextSearchStrings;
-import bio.terra.tanagra.indexing.job.ComputeDisplayHints;
-import bio.terra.tanagra.indexing.job.ComputeRollupCounts;
-import bio.terra.tanagra.indexing.job.CreateEntityTable;
-import bio.terra.tanagra.indexing.job.DenormalizeEntityInstances;
-import bio.terra.tanagra.indexing.job.WriteAncestorDescendantIdPairs;
-import bio.terra.tanagra.indexing.job.WriteParentChildIdPairs;
-import bio.terra.tanagra.indexing.job.WriteRelationshipIdPairs;
+import bio.terra.tanagra.exception.InvalidConfigException;
+import bio.terra.tanagra.indexing.job.*;
import bio.terra.tanagra.indexing.jobexecutor.JobRunner;
import bio.terra.tanagra.indexing.jobexecutor.ParallelRunner;
import bio.terra.tanagra.indexing.jobexecutor.SequencedJobSet;
import bio.terra.tanagra.indexing.jobexecutor.SerialRunner;
+import bio.terra.tanagra.query.Literal;
import bio.terra.tanagra.underlay.*;
import bio.terra.tanagra.underlay.entitygroup.CriteriaOccurrence;
import bio.terra.tanagra.underlay.entitygroup.GroupItems;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
-import java.util.List;
+import java.util.*;
import java.util.stream.Collectors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -54,19 +48,55 @@ public static Indexer deserializeUnderlay(String underlayFileName) throws IOException {
return new Indexer(Underlay.fromJSON(underlayFileName));
}

-  /** Scan the source data to validate data pointers, lookup data types, generate UI hints, etc. */
-  public void scanSourceData() {
-    // TODO: Validate existence and access for data/table/field pointers.
-    underlay
-        .getEntities()
-        .values()
+  public void validateConfig() {
+    // Check that the attribute data types are all defined and match the expected.
+    Map<String, List<String>> errorsForEntity = new HashMap<>();
+    underlay.getEntities().values().stream()
+        .sorted(Comparator.comparing(Entity::getName))
         .forEach(
-            e -> {
-              LOGGER.info(
-                  "Looking up attribute data types and generating UI hints for entity: "
-                      + e.getName());
-              e.scanSourceData();
+            entity -> {
+              List<String> errors = new ArrayList<>();
+              entity.getAttributes().stream()
+                  .sorted(Comparator.comparing(Attribute::getName))
+                  .forEach(
+                      attribute -> {
+                        LOGGER.info(
+                            "Validating data type for entity {}, attribute {}",
+                            entity.getName(),
+                            attribute.getName());
+                        Literal.DataType computedDataType =
+                            attribute.getMapping(Underlay.MappingType.SOURCE).computeDataType();
+                        if (attribute.getDataType() == null
+                            || !attribute.getDataType().equals(computedDataType)) {
+                          String msg =
+                              "attribute: "
+                                  + attribute.getName()
+                                  + ", expected data type: "
+                                  + computedDataType
+                                  + ", actual data type: "
+                                  + attribute.getDataType();
+                          errors.add(msg);
+                          LOGGER.info("entity: {}, {}", entity.getName(), msg);
+                        }
+                      });
+              if (!errors.isEmpty()) {
+                errorsForEntity.put(entity.getName(), errors);
+              }
             });

+    // Output any error messages.
+    if (errorsForEntity.isEmpty()) {
+      LOGGER.info("Validation of attribute data types succeeded");
+    } else {
+      errorsForEntity.keySet().stream()
+          .sorted()
+          .forEach(
+              entityName -> {
+                LOGGER.warn("Validation of attribute data types for entity {} failed", entityName);
+                errorsForEntity.get(entityName).stream().forEach(msg -> LOGGER.warn(msg));
+              });
+      throw new InvalidConfigException("Validation of attribute data types had errors");
+    }
}

public JobRunner runJobsForAllEntities(
@@ -121,6 +151,7 @@ public SequencedJobSet getJobSetForEntity(Entity entity) {

jobSet.startNewStage();
jobSet.addJob(new DenormalizeEntityInstances(entity));
+jobSet.addJob(new ComputeEntityLevelDisplayHints(entity));

if (entity.getTextSearch().isEnabled() || entity.hasHierarchies()) {
jobSet.startNewStage();
14 changes: 3 additions & 11 deletions indexer/src/main/java/bio/terra/tanagra/indexing/Main.java
@@ -7,14 +7,13 @@
import bio.terra.tanagra.exception.SystemException;
import bio.terra.tanagra.indexing.jobexecutor.JobRunner;
import bio.terra.tanagra.utils.FileIO;
-import bio.terra.tanagra.utils.FileUtils;
import java.nio.file.Path;

public final class Main {
private Main() {}

enum Command {
-EXPAND_CONFIG,
+VALIDATE_CONFIG,
INDEX_ENTITY,
INDEX_ENTITY_GROUP,
INDEX_ALL,
@@ -37,15 +36,8 @@ public static void main(String... args) throws Exception {
Indexer.deserializeUnderlay(Path.of(underlayFilePath).getFileName().toString());

switch (cmd) {
-      case EXPAND_CONFIG:
-        String outputDirPath = args[2];
-
-        FileIO.setOutputParentDir(Path.of(outputDirPath));
-        FileUtils.createDirectoryIfNonexistent(FileIO.getOutputParentDir());
-
-        indexer.scanSourceData();
-
-        indexer.getUnderlay().serializeAndWriteToFile();
+      case VALIDATE_CONFIG:
+        indexer.validateConfig();
break;
case INDEX_ENTITY:
case CLEAN_ENTITY:
Expand Down
