AJ-1517 download files only once #448

Merged — 25 commits, Jan 26, 2024
Changes from 20 commits

Commits (25)
6ba24d4
first pass one download
calypsomatic Jan 9, 2024
d3a173c
merge main and fix test
calypsomatic Jan 9, 2024
c8f2e2f
some cleanup
calypsomatic Jan 9, 2024
04b1ba4
first pass attempt to use directories WIP
calypsomatic Jan 10, 2024
12d1310
Merge branch 'main' into aj-1517-direct-http
calypsomatic Jan 10, 2024
feb9021
merge main
calypsomatic Jan 11, 2024
7af31b8
clean up
calypsomatic Jan 17, 2024
6929e4c
Merge branch 'main' into aj-1517-direct-http
calypsomatic Jan 17, 2024
b00b902
spotless
calypsomatic Jan 17, 2024
4e6c17b
back to multimap
calypsomatic Jan 18, 2024
c938683
minor sonar improvements
calypsomatic Jan 18, 2024
7122290
try custom filedownload class
calypsomatic Jan 18, 2024
bbcfcd9
cleanup/pr comments
calypsomatic Jan 19, 2024
daaa3c1
Merge branch 'main' into aj-1517-direct-http
calypsomatic Jan 19, 2024
f5b2fae
add retries
calypsomatic Jan 19, 2024
7db74d6
Merge branch 'main' into aj-1517-direct-http
calypsomatic Jan 19, 2024
421bb39
Add DownloadHelper seam for test override. (#468)
jladieu Jan 22, 2024
e7554a5
downloadhelper retry test
calypsomatic Jan 22, 2024
c29481f
Merge branch 'main' into aj-1517-direct-http
calypsomatic Jan 23, 2024
3b1816d
Merge branch 'main' into aj-1517-direct-http
calypsomatic Jan 24, 2024
51e0682
remove retry
calypsomatic Jan 24, 2024
3e2e841
actually remove retry
calypsomatic Jan 24, 2024
31de46d
use permissions to satisfy sonar
calypsomatic Jan 25, 2024
140d817
minor sonar complaints
calypsomatic Jan 25, 2024
16daf59
fix spotless
calypsomatic Jan 25, 2024
New file: FileDownloadHelper.java
@@ -0,0 +1,94 @@
package org.databiosphere.workspacedataservice.dataimport;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.PosixFilePermission;
import java.util.EnumSet;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.databiosphere.workspacedataservice.service.model.exception.TdrManifestImportException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;

public class FileDownloadHelper {

private final DownloadHelper downloadHelper;

public interface DownloadHelper {
default void copyURLToFile(URL sourceUrl, File destinationFile) throws IOException {
Author (Contributor) commented:
Needed this method to not be abstract in order to mock it.

Contributor replied:
Very weird, I wasn't expecting that.

{
FileUtils.copyURLToFile(sourceUrl, destinationFile);
}
}
}
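The exchange above refers to how this seam gets stubbed in the new FileDownloadHelperTest later in this PR; a compact sketch of the pattern (Mockito is already used by that test, so nothing new is assumed beyond the hypothetical class name):

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;

import java.io.File;
import java.io.IOException;
import java.net.URL;

class DownloadHelperMockingSketch {
  void stubDownloads() throws IOException {
    FileDownloadHelper.DownloadHelper mockHelper = mock(FileDownloadHelper.DownloadHelper.class);

    // Fail the first call, then fall through to the real default body;
    // doCallRealMethod() only works here because copyURLToFile is not abstract.
    doThrow(new IOException("simulated connection error"))
        .doCallRealMethod()
        .when(mockHelper)
        .copyURLToFile(any(URL.class), any(File.class));
  }
}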

private final Logger logger = LoggerFactory.getLogger(this.getClass());
private final Path tempFileDir;
private final Multimap<String, File> fileMap;
private final Set<PosixFilePermission> permissions =
EnumSet.of(
PosixFilePermission.OWNER_READ,
PosixFilePermission.GROUP_READ,
PosixFilePermission.OTHERS_READ);
Collaborator commented:
Could this be locked down even further, to just OWNER_READ? Is there a need for group/others to also read?
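If that suggestion were adopted, the permission set would shrink to a single entry. A minimal sketch of the tighter field (the reviewer's proposal, not what this revision of the diff contains):

  // Sketch only: owner-read, with no group/other access, per the review question above.
  private final Set<PosixFilePermission> permissions = EnumSet.of(PosixFilePermission.OWNER_READ);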


public FileDownloadHelper(String dirName, DownloadHelper downloadHelper) throws IOException {
this.tempFileDir = Files.createTempDirectory(dirName);
this.downloadHelper = downloadHelper;
this.fileMap = HashMultimap.create();
}

public FileDownloadHelper(String dirName) throws IOException {
this(
dirName,
new DownloadHelper() {
@jladieu (Contributor) commented on Jan 24, 2024:
I think this doesn't need the anonymous subclass & override anymore since the default impl does the right thing. Just pass in new DownloadHelper() (see the sketch after this constructor).

@Override
public void copyURLToFile(URL sourceUrl, File destinationFile) throws IOException {
DownloadHelper.super.copyURLToFile(sourceUrl, destinationFile);
}
});
}
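A sketch of the simplification suggested in the comment above: because copyURLToFile has a default body, the convenience constructor could pass a bare DownloadHelper instead of an anonymous override. This is the reviewer's proposal, not the code as shown in this revision:

  // Sketch: rely on the interface's default copyURLToFile implementation.
  public FileDownloadHelper(String dirName) throws IOException {
    this(dirName, new DownloadHelper() {});
  }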

@Retryable(maxAttempts = 3, backoff = @Backoff(delay = 1000))
public void downloadFileFromURL(String tableName, URL pathToRemoteFile) {
try {
File tempFile =
File.createTempFile(/* prefix= */ "tdr-", /* suffix= */ "download", tempFileDir.toFile());
logger.info("downloading to temp file {} ...", tempFile.getPath());
downloadHelper.copyURLToFile(pathToRemoteFile, tempFile);
// In the TDR manifest, for Azure snapshots only,
// the first file in the list will always be a directory.
// Attempting to import that directory
// will fail; it has no content. To avoid those failures,
// check files for length and ignore any that are empty
if (tempFile.length() == 0) {
logger.info("Empty file in parquet, skipping");
Files.delete(tempFile.toPath());
} else {
// Once the remote file has been copied to the temp file, make it read-only
Files.setPosixFilePermissions(tempFile.toPath(), permissions);
fileMap.put(tableName, tempFile);
}
} catch (IOException e) {
throw new TdrManifestImportException(e.getMessage(), e);
}
}

public void deleteFileDirectory() {
try {
Files.delete(tempFileDir);
} catch (IOException e) {
logger.error("Error deleting temporary files: {}", e.getMessage());
}
}

public Multimap<String, File> getFileMap() {
return this.fileMap;
}
}
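For orientation, a minimal usage sketch of the new helper's lifecycle, as TdrManifestQuartzJob drives it in the next file; the table name and URL below are hypothetical placeholders, not values from this PR:

import com.google.common.collect.Multimap;
import java.io.File;
import java.io.IOException;
import java.net.URL;

class FileDownloadHelperUsageSketch {
  void run() throws IOException {
    // One temp directory holds every parquet file downloaded for the import.
    FileDownloadHelper files = new FileDownloadHelper("tempParquetDir");

    // Download each remote file exactly once, keyed by its target table
    // (hypothetical URL; the real caller passes the manifest's data-file URLs).
    files.downloadFileFromURL("sample", new URL("https://example.com/sample.parquet"));

    // Both import passes (BASE_ATTRIBUTES, then RELATIONS) read from this map.
    Multimap<String, File> fileMap = files.getFileMap();

    // Clean up the temp directory once the import has completed.
    files.deleteFileDirectory();
  }
}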
TdrManifestQuartzJob.java

@@ -20,21 +20,18 @@
import java.util.Set;
import java.util.UUID;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
import org.databiosphere.workspacedataservice.activitylog.ActivityLogger;
import org.databiosphere.workspacedataservice.dao.JobDao;
import org.databiosphere.workspacedataservice.dataimport.FileDownloadHelper;
import org.databiosphere.workspacedataservice.dataimport.WsmSnapshotSupport;
import org.databiosphere.workspacedataservice.jobexec.JobExecutionException;
import org.databiosphere.workspacedataservice.jobexec.QuartzJob;
import org.databiosphere.workspacedataservice.recordstream.TwoPassStreamingWriteHandler;
import org.databiosphere.workspacedataservice.recordstream.TwoPassStreamingWriteHandler.ImportMode;
import org.databiosphere.workspacedataservice.retry.RestClientRetry;
import org.databiosphere.workspacedataservice.service.BatchWriteService;
import org.databiosphere.workspacedataservice.service.model.BatchWriteResult;
@@ -106,16 +103,19 @@ protected void executeInternal(UUID jobId, JobExecutionContext context) {
List<TdrManifestImportTable> tdrManifestImportTables =
extractTableInfo(snapshotExportResponseModel);

// get all the parquet files from the manifests
FileDownloadHelper files = getFilesForImport(tdrManifestImportTables);

// loop through the tables to be imported and upsert base attributes
var result =
importTables(
tdrManifestImportTables,
files.getFileMap(),
targetInstance,
TwoPassStreamingWriteHandler.ImportMode.BASE_ATTRIBUTES);
ImportMode.BASE_ATTRIBUTES);

// add relations to the existing base attributes
importTables(
tdrManifestImportTables, targetInstance, TwoPassStreamingWriteHandler.ImportMode.RELATIONS);
importTables(tdrManifestImportTables, files.getFileMap(), targetInstance, ImportMode.RELATIONS);

// activity logging for import status
// no specific activity logging for relations since main import is a superset
@@ -130,61 +130,38 @@ protected void executeInternal(UUID jobId, JobExecutionContext context) {
.withRecordType(entry.getKey())
.ofQuantity(entry.getValue()));
});
// delete temp files after everything else is completed
// Any failed deletions will be removed if/when pod restarts
files.deleteFileDirectory();
}

/**
* Given a single Parquet file to be imported, import it
*
* @param path path to Parquet file to be imported.
* @param inputFile Parquet file to be imported.
* @param table info about the table to be imported
* @param targetInstance instance into which to import
* @param importMode mode for this invocation
* @return statistics on what was imported
*/
@VisibleForTesting
BatchWriteResult importTable(
URL path,
InputFile inputFile,
TdrManifestImportTable table,
UUID targetInstance,
TwoPassStreamingWriteHandler.ImportMode importMode) {
try {
// download the file from the URL to a temp file on the local filesystem
// Azure urls, with SAS tokens, don't need any particular auth.
// TODO AJ-1517 can we access the URL directly, no temp file?
File tempFile = File.createTempFile("tdr-", "download");
logger.info("downloading to temp file {} ...", tempFile.getPath());
FileUtils.copyURLToFile(path, tempFile);
Path hadoopFilePath = new Path(tempFile.getPath());
// do we need any other config here?
Configuration configuration = new Configuration();

// In the TDR manifest, for Azure snapshots only,
// the first file in the list will always be a directory. Attempting to import that directory
// will fail; it has no content. To avoid those failures,
// check files for length and ignore any that are empty
FileSystem fileSystem = FileSystem.get(configuration);
FileStatus fileStatus = fileSystem.getFileStatus(hadoopFilePath);
if (fileStatus.getLen() == 0) {
logger.info("Empty file in parquet, skipping");
return BatchWriteResult.empty();
}

// generate the HadoopInputFile
InputFile inputFile = HadoopInputFile.fromPath(hadoopFilePath, configuration);

// upsert this parquet file's contents
try (ParquetReader<GenericRecord> avroParquetReader =
AvroParquetReader.<GenericRecord>builder(inputFile)
.set(READ_INT96_AS_FIXED, "true")
.build()) {
logger.info("batch-writing records for file ...");

BatchWriteResult result =
batchWriteService.batchWriteParquetStream(
avroParquetReader, targetInstance, table, importMode);

return result;
}
ImportMode importMode) {
// upsert this parquet file's contents
try (ParquetReader<GenericRecord> avroParquetReader =
AvroParquetReader.<GenericRecord>builder(inputFile)
.set(READ_INT96_AS_FIXED, "true")
.build()) {
logger.info("batch-writing records for file ...");

BatchWriteResult result =
batchWriteService.batchWriteParquetStream(
avroParquetReader, targetInstance, table, importMode);

return result;
} catch (Throwable t) {
logger.error("Hit an error on file: {}", t.getMessage());
throw new TdrManifestImportException(t.getMessage());
@@ -200,36 +177,77 @@ BatchWriteResult importTable(
*/
private BatchWriteResult importTables(
List<TdrManifestImportTable> importTables,
Multimap<String, File> fileMap,
UUID targetInstance,
TwoPassStreamingWriteHandler.ImportMode importMode) {
ImportMode importMode) {

var combinedResult = BatchWriteResult.empty();
// loop through the tables that have data files.
importTables.forEach(
importTable -> {
logger.info("Processing table '{}' ...", importTable.recordType().getName());

// find all Parquet files for this table
List<URL> paths = importTable.dataFiles();
logger.debug(
"Table '{}' has {} export file(s) ...",
importTable.recordType().getName(),
paths.size());

// loop through each parquet file
paths.forEach(
path -> {
var result = importTable(path, importTable, targetInstance, importMode);

if (result != null) {
combinedResult.merge(result);
}
});
fileMap
.get(importTable.recordType().getName())
.forEach(
file -> {
try {
org.apache.hadoop.fs.Path hadoopFilePath =
new org.apache.hadoop.fs.Path(file.toString());
Configuration configuration = new Configuration();

// generate the HadoopInputFile
InputFile inputFile = HadoopInputFile.fromPath(hadoopFilePath, configuration);
var result = importTable(inputFile, importTable, targetInstance, importMode);
if (result != null) {
combinedResult.merge(result);
}
} catch (IOException e) {
throw new TdrManifestImportException(e.getMessage(), e);
}
});
});

return combinedResult;
}

/**
* Given the list of tables/data files to be imported, loop through and download each one to a
* temporary file
*
* @param importTables tables to be imported
* @return path for the directory where downloaded files are located
*/
@VisibleForTesting
FileDownloadHelper getFilesForImport(List<TdrManifestImportTable> importTables) {
try {
FileDownloadHelper files = new FileDownloadHelper("tempParquetDir");

// loop through the tables that have data files.
importTables.forEach(
importTable -> {
logger.info("Fetching files for table '{}' ...", importTable.recordType().getName());

// find all Parquet files for this table
List<URL> paths = importTable.dataFiles();
logger.debug(
"Table '{}' has {} export file(s) ...",
importTable.recordType().getName(),
paths.size());

// loop through each parquet file
paths.forEach(
path -> {
files.downloadFileFromURL(importTable.recordType().getName(), path);
});
});

return files;
} catch (IOException e) {
throw new TdrManifestImportException("Error downloading temporary files", e);
}
}

/**
* Read the manifest from the user-specified URL into a SnapshotExportResponseModel java object
*
New file: FileDownloadHelperTest.java
@@ -0,0 +1,67 @@
package org.databiosphere.workspacedataservice.dataimport;

import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.core.io.Resource;
import org.springframework.retry.backoff.FixedBackOffPolicy;
import org.springframework.retry.support.RetryTemplate;

@SpringBootTest
public class FileDownloadHelperTest {

@Value("classpath:parquet/empty.parquet")
Resource emptyParquet;

@Value("classpath:parquet/v2f/all_data_types.parquet")
Resource allDataTypesParquet;

@Test
void downloadEmptyFile() throws IOException {
Author (Contributor) commented:
This test is a duplicate of the test in TdrManifestQuartzJobTest. This seems like the place for it, but should I alter the other test to verify the behavior of the actual TdrManifestQuartzJob with an empty file, or is this test sufficient?

Collaborator replied:
I think it's worth testing both TdrManifestQuartzJob.getFilesForImport() in TdrManifestQuartzJobTest and FileDownloadHelper.downloadFileFromURL() here, even though they're very similar.

FileDownloadHelper helper = new FileDownloadHelper("test");
assertDoesNotThrow(() -> helper.downloadFileFromURL("empty_table", emptyParquet.getURL()));
assert helper.getFileMap().isEmpty();
}

@Test
void testRetry() throws Exception {
FileDownloadHelper.DownloadHelper mockDownloadHelper =
Mockito.mock(FileDownloadHelper.DownloadHelper.class);
doThrow(new IOException("Simulated connection error"))
.doCallRealMethod() // Succeed on the second attempt
.when(mockDownloadHelper)
.copyURLToFile(any(URL.class), any(File.class));

// Create a RetryTemplate to set off Spring's retryable
Author (Contributor) commented:
Spring's @Retryable wasn't working, and among the several hideous options I could find to try to rig it to trigger, this one seemed the least convoluted.

Collaborator replied:
The fact that you need to include a RetryTemplate here is an indication that @Retryable is not working correctly in FileDownloadHelper. From this test, you should be able to just call helper.downloadFileFromURL() outside of a RetryTemplate and it will work.

I am pretty certain the reason that @Retryable is not working is that FileDownloadHelper is not a Spring bean, so Spring can't set up any proxying for it and therefore can't implement retries. To use @Retryable you'll have to lean into Spring-ness. I can see a few options (a rough sketch of option 1 follows after this file):

  1. FileDownloadHelper is a singleton bean (which is what we use everywhere else), and you move all statefulness about the temp dir into DownloadHelper.
  2. FileDownloadHelper is a prototype bean which gets created on demand, including its temp-dir statefulness. You'll probably need a separate singleton factory bean to do the creation.

There are probably other ways to do it. (I could pair on this / contribute code if my explanations don't make sense.)

RetryTemplate retryTemplate = new RetryTemplate();
retryTemplate.setBackOffPolicy(new FixedBackOffPolicy());

FileDownloadHelper helper = new FileDownloadHelper("test", mockDownloadHelper);

// A single connectivity error should not throw
assertDoesNotThrow(
() ->
retryTemplate.execute(
context -> {
helper.downloadFileFromURL("table", allDataTypesParquet.getURL());
return null;
}));

// Make sure there actually was a connectivity problem
verify(mockDownloadHelper, times(2)).copyURLToFile(any(URL.class), any(File.class));

// File should successfully download on second attempt
assert helper.getFileMap().containsKey("table");
assert helper.getFileMap().get("table").size() == 1;
}
}
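Following up on the collaborator's comment in the test above: a rough sketch of option 1, where the retryable download call lives on a Spring-managed singleton so the @Retryable proxy actually applies. The bean name and wiring are hypothetical, retry is assumed to be enabled via @EnableRetry in configuration, and this is not code from this PR:

import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.apache.commons.io.FileUtils;
import org.springframework.retry.annotation.Backoff;
import org.springframework.retry.annotation.Retryable;
import org.springframework.stereotype.Component;

// Sketch of option 1: a stateless singleton bean owns the download-with-retry,
// so Spring can proxy it; FileDownloadHelper (or its callers) would inject this
// bean and keep the temp-directory bookkeeping elsewhere, per the review thread.
@Component
public class RetryingDownloadHelper {

  @Retryable(maxAttempts = 3, backoff = @Backoff(delay = 1000))
  public void copyURLToFile(URL sourceUrl, File destinationFile) throws IOException {
    FileUtils.copyURLToFile(sourceUrl, destinationFile);
  }
}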