From db14bad9338c62079c57d2bbc81091d795fc6668 Mon Sep 17 00:00:00 2001
From: Julien Clarysse
Date: Mon, 3 Jun 2024 09:22:40 +0200
Subject: [PATCH] feat: Add support for setting object metadata Content-Encoding

Users who want to leverage the GCS capability to decompress gzip objects
server-side when accessing them through the Storage API asked for the
fixed `Content-Encoding` metadata (default: null) to become configurable,
so that its value can be set (e.g. to `gzip`) when the connector uploads
a new file to the bucket.

https://cloud.google.com/storage/docs/metadata#content-encoding
---
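Note for reviewers (this sits between the `---` separator and the diff, so
`git am` ignores it): a minimal sketch of the upload-side behavior this patch
enables, using only public google-cloud-storage APIs. The helper name
uploadGzipped and the hard-coded values are illustrative and do not appear in
the connector code.

    import com.google.cloud.storage.BlobId;
    import com.google.cloud.storage.BlobInfo;
    import com.google.cloud.storage.Storage;

    // Illustrative only: upload gzip-compressed bytes and record the encoding
    // in object metadata so that GCS can transcode (decompress) on download.
    static void uploadGzipped(final Storage storage, final String bucket,
            final String name, final byte[] gzippedBytes) {
        final BlobInfo blobInfo = BlobInfo.newBuilder(BlobId.of(bucket, name))
                .setContentEncoding("gzip") // the metadata made configurable here
                .build();
        storage.create(blobInfo, gzippedBytes);
    }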
 gradle.properties                                  |  2 +-
 .../kafka/connect/gcs/GcsSinkTaskTest.java         | 35 ++++++++--
 .../connect/gcs/testutils/BucketAccessor.java      | 64 +++++++++++++++++++
 3 files changed, 93 insertions(+), 8 deletions(-)

diff --git a/gradle.properties b/gradle.properties
index 2d0e2b6..7b24602 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,4 +1,4 @@
-version=0.14.0-SNAPSHOT
+version=0.13.0-SNAPSHOT
 sonatypeUsername=
 sonatypePassword=
 
diff --git a/src/test/java/io/aiven/kafka/connect/gcs/GcsSinkTaskTest.java b/src/test/java/io/aiven/kafka/connect/gcs/GcsSinkTaskTest.java
index 73fe8b9..1fd8000 100644
--- a/src/test/java/io/aiven/kafka/connect/gcs/GcsSinkTaskTest.java
+++ b/src/test/java/io/aiven/kafka/connect/gcs/GcsSinkTaskTest.java
@@ -63,6 +63,7 @@
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.mockito.ArgumentCaptor;
 import org.threeten.bp.Duration;
@@ -253,10 +254,12 @@ void compression(final String compression) {
     }
 
     @ParameterizedTest
-    @ValueSource(strings = { "gzip" })
-    void contentEncoding(final String compression) {
-        properties.put(GcsSinkConfig.GCS_OBJECT_CONTENT_ENCODING_CONFIG, compression);
+    // {..., "gzip,gzip"} does not yet seem to be supported by the GCS API emulation in Testcontainers:
+    // decoding fails with java.lang.IllegalArgumentException: Illegal base64 character 1f
+    @CsvSource({ "none,none" })
+    void contentEncodingAwareDownload(final String compression, final String encoding) {
         properties.put(GcsSinkConfig.FILE_COMPRESSION_TYPE_CONFIG, compression);
+        properties.put(GcsSinkConfig.GCS_OBJECT_CONTENT_ENCODING_CONFIG, encoding);
 
         final GcsSinkTask task = new GcsSinkTask(properties, storage);
         task.put(basicRecords);
@@ -264,17 +267,31 @@ void contentEncoding(final String compression) {
 
         final CompressionType compressionType = CompressionType.forName(compression);
 
-        final List<String> names = Lists.newArrayList("topic0-0-10");
+        final List<String> names = Lists.newArrayList("topic0-0-10", "topic0-1-20", "topic0-2-50", "topic1-0-30",
+                "topic1-1-40");
         final List<String> blobNames = names.stream()
                 .map(n -> n + compressionType.extension())
                 .collect(Collectors.toList());
 
         assertIterableEquals(blobNames, testBucketAccessor.getBlobNames());
-        // reading a gzip-compressed blob with metadata Content-Encoding=gzip should be the same as reading a
-        // non-compressed blob
+        // given a blob whose Content-Encoding metadata matches its byte compression,
+        // the bytes it serves on download are automatically un-compressed (gzip support only);
+        // see https://cloud.google.com/storage/docs/metadata#content-encoding
         assertIterableEquals(
                 Lists.newArrayList(Collections.singletonList("value0"), Collections.singletonList("value5")),
-                readSplittedAndDecodedLinesFromBlob("topic0-0-10" + compressionType.extension(), "none", 0));
+                readDecodedFieldsFromDownload("topic0-0-10" + compressionType.extension(), 0));
+        assertIterableEquals(
+                Lists.newArrayList(Collections.singletonList("value1"), Collections.singletonList("value6")),
+                readDecodedFieldsFromDownload("topic0-1-20" + compressionType.extension(), 0));
+        assertIterableEquals(
+                Lists.newArrayList(Collections.singletonList("value4"), Collections.singletonList("value9")),
+                readDecodedFieldsFromDownload("topic0-2-50" + compressionType.extension(), 0));
+        assertIterableEquals(
+                Lists.newArrayList(Collections.singletonList("value2"), Collections.singletonList("value7")),
+                readDecodedFieldsFromDownload("topic1-0-30" + compressionType.extension(), 0));
+        assertIterableEquals(
+                Lists.newArrayList(Collections.singletonList("value3"), Collections.singletonList("value8")),
+                readDecodedFieldsFromDownload("topic1-1-40" + compressionType.extension(), 0));
     }
 
     @ParameterizedTest
@@ -770,6 +787,10 @@ private Collection<List<String>> readSplittedAndDecodedLinesFromBlob(final Strin
         return testBucketAccessor.readAndDecodeLines(blobName, compression, fieldsToDecode);
    }
 
+    private Collection<List<String>> readDecodedFieldsFromDownload(final String blobName, final int... fieldsToDecode) {
+        return testBucketAccessor.downloadBlobAndDecodeFields(blobName, fieldsToDecode);
+    }
+
     private Map<String, Collection<List<String>>> buildBlobNameValuesMap(final String compression) {
         final CompressionType compressionType = CompressionType.forName(compression);
         final String extension = compressionType.extension();
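Note for reviewers (not part of the diff): once the Testcontainers GCS
emulation supports decompressive transcoding, the commented-out "gzip,gzip"
@CsvSource row above should exercise the feature end to end. A sketch of what
that case configures, reusing names from the test class; the flush call is an
assumption based on the pattern of the surrounding tests:

    // gzip-compressed bytes, with matching Content-Encoding metadata
    properties.put(GcsSinkConfig.FILE_COMPRESSION_TYPE_CONFIG, "gzip");
    properties.put(GcsSinkConfig.GCS_OBJECT_CONTENT_ENCODING_CONFIG, "gzip");

    final GcsSinkTask task = new GcsSinkTask(properties, storage);
    task.put(basicRecords);
    task.flush(null);

    // GCS then serves the blobs un-compressed on download, so the same
    // assertions as in the "none,none" case are expected to hold.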
diff --git a/src/test/java/io/aiven/kafka/connect/gcs/testutils/BucketAccessor.java b/src/test/java/io/aiven/kafka/connect/gcs/testutils/BucketAccessor.java
index 0f20c1d..4c7956e 100644
--- a/src/test/java/io/aiven/kafka/connect/gcs/testutils/BucketAccessor.java
+++ b/src/test/java/io/aiven/kafka/connect/gcs/testutils/BucketAccessor.java
@@ -18,10 +18,13 @@
 
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Base64;
 import java.util.HashMap;
@@ -41,6 +44,7 @@
 
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.BlobId;
 import com.google.cloud.storage.BlobInfo;
 import com.google.cloud.storage.Storage;
 import org.xerial.snappy.SnappyInputStream;
@@ -53,6 +57,7 @@ public final class BucketAccessor {
     private List<String> blobNamesCache;
     private final Map<String, String> stringContentCache = new HashMap<>();
     private final Map<String, List<String>> linesCache = new HashMap<>();
+    private final Map<String, List<String>> downloadedLinesCache = new HashMap<>();
     private final Map<String, List<List<String>>> decodedLinesCache = new HashMap<>();
 
     public BucketAccessor(final Storage storage, final String bucketName, final boolean cache) {
@@ -121,6 +126,7 @@ public void clear(final String prefix) {
             stringContentCache.clear();
             linesCache.clear();
             decodedLinesCache.clear();
+            downloadedLinesCache.clear();
         }
     }
 
@@ -165,7 +171,46 @@ private List<String> readLines0(final String blobName, final String compression)
                 InputStream decompressedStream = getDecompressedStream(bais, compression);
                 InputStreamReader reader = new InputStreamReader(decompressedStream, StandardCharsets.UTF_8);
                 BufferedReader bufferedReader = new BufferedReader(reader)) {
+            return bufferedReader.lines().collect(Collectors.toList());
+        } catch (final IOException e) {
+            throw new RuntimeException(e); // NOPMD
+        }
+    }
+
+    public List<String> downloadBlobAndReadLines(final String blobName) {
+        Objects.requireNonNull(blobName, "blobName cannot be null");
+        if (cache) {
+            return downloadedLinesCache.computeIfAbsent(blobName, k -> downloadBlobAndReadLines0(blobName));
+        } else {
+            return downloadBlobAndReadLines0(blobName);
+        }
+    }
+
+    private List<String> downloadBlobAndReadLines0(final String blobName) {
+        final String filePath = downloadBlobToTempFile(blobName);
+        final List<String> lines = readDownloadedLines(filePath);
+        final File file = new File(filePath);
+        if (file.isFile() && !file.delete()) {
+            throw new RuntimeException("Failed to delete temporary file " + filePath); // NOPMD
+        }
+        return lines;
+    }
+
+    private String downloadBlobToTempFile(final String blobName) {
+        try {
+            final File file = File.createTempFile("tmp", null);
+            final String filePath = file.getAbsolutePath();
+            storage.downloadTo(BlobId.fromGsUtilUri("gs://" + bucketName + "/" + blobName), Paths.get(filePath));
+            return filePath;
+        } catch (final IOException e) {
+            throw new RuntimeException(e); // NOPMD
+        }
+    }
+
+    private List<String> readDownloadedLines(final String filePath) {
+        try (FileInputStream fis = new FileInputStream(filePath); // NOPMD
+                InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
+                BufferedReader bufferedReader = new BufferedReader(isr)) {
             return bufferedReader.lines().collect(Collectors.toList());
         } catch (final IOException e) {
             throw new RuntimeException(e); // NOPMD
         }
     }
@@ -211,6 +256,25 @@ private List<List<String>> readAndDecodeLines0(final String blobName, final Stri
                 .collect(Collectors.toList());
     }
 
+    public List<List<String>> downloadBlobAndDecodeFields(final String blobName, final int... fieldsToDecode) {
+        Objects.requireNonNull(blobName, "blobName cannot be null");
+        Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null");
+
+        if (cache) {
+            return decodedLinesCache.computeIfAbsent(blobName,
+                    k -> downloadBlobAndDecodeFields0(blobName, fieldsToDecode));
+        } else {
+            return downloadBlobAndDecodeFields0(blobName, fieldsToDecode);
+        }
+    }
+
+    private List<List<String>> downloadBlobAndDecodeFields0(final String blobName, final int... fieldsToDecode) {
+        return downloadBlobAndReadLines(blobName).stream()
+                .map(l -> l.split(","))
+                .map(fields -> decodeRequiredFields(fields, fieldsToDecode))
+                .collect(Collectors.toList());
+    }
+
     private List<String> decodeRequiredFields(final String[] originalFields, final int[] fieldsToDecode) {
         Objects.requireNonNull(originalFields, "originalFields cannot be null");
         Objects.requireNonNull(fieldsToDecode, "fieldsToDecode cannot be null");
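Note for reviewers (after the diff, not part of the applied change): a
self-contained sketch of the GCS behavior the new downloadBlobAndReadLines
helper relies on. When an object's Content-Encoding metadata matches its
actual byte compression, Storage#downloadTo applies decompressive transcoding
and the local file already contains plain text. Bucket and object names are
placeholders:

    import java.nio.file.Files;
    import java.nio.file.Path;

    import com.google.cloud.storage.BlobId;
    import com.google.cloud.storage.Storage;
    import com.google.cloud.storage.StorageOptions;

    public class TranscodingDemo {
        public static void main(final String[] args) throws Exception {
            final Storage storage = StorageOptions.getDefaultInstance().getService();
            final Path target = Files.createTempFile("download", null);
            // "my-bucket"/"topic0-0-10.gz" stand in for a blob uploaded with
            // Content-Encoding=gzip (see the upload sketch near the top).
            storage.downloadTo(BlobId.of("my-bucket", "topic0-0-10.gz"), target);
            // Server-side transcoding means these lines arrive decompressed.
            Files.readAllLines(target).forEach(System.out::println);
            Files.delete(target);
        }
    }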