From 0b13f0a1f025b93d9c63c87f9442d28fa9d3f7f0 Mon Sep 17 00:00:00 2001 From: bijay27bit Date: Tue, 3 Dec 2024 11:28:07 +0000 Subject: [PATCH 1/3] GCS Source additional scenarios. --- .../gcs/source/GCSSourceError.feature | 1 + .../gcs/source/GCSSourceToBigQuery.feature | 310 ++++++++++++++++++ .../common/stepsdesign/TestSetupHooks.java | 7 +- .../resources/pluginParameters.properties | 18 +- .../resources/testdata/GCS_JSON_TEST.json | 1 + 5 files changed, 335 insertions(+), 2 deletions(-) create mode 100755 src/e2e-test/resources/testdata/GCS_JSON_TEST.json diff --git a/src/e2e-test/features/gcs/source/GCSSourceError.feature b/src/e2e-test/features/gcs/source/GCSSourceError.feature index c74f1d7c19..5af6f881b5 100644 --- a/src/e2e-test/features/gcs/source/GCSSourceError.feature +++ b/src/e2e-test/features/gcs/source/GCSSourceError.feature @@ -11,6 +11,7 @@ Feature: GCS source - Verify GCS Source plugin error scenarios | property | | path | | format | + | referenceName | Scenario: To verify Error message for invalid bucket name Given Open Datafusion Project to configure pipeline diff --git a/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature b/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature index eebb3a1529..2d7d530f7e 100644 --- a/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature +++ b/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature @@ -285,3 +285,313 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Verify the pipeline status is "Succeeded" Then Get count of no of records transferred to target BigQuery Table Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table + + @GCS_TSV_TEST @BQ_SINK_TEST + Scenario: To verify successful data transfer from GCS source to BigQuery sink using tsv file format + Given Open Datafusion Project to configure pipeline + When Select plugin: "GCS" from the plugins list as: "Source" + When Expand 
Plugin group in the LHS plugins list: "Sink" + When Select plugin: "BigQuery" from the plugins list as: "Sink" + Then Connect source as "GCS" and sink as "BigQuery" to establish connection + Then Open GCS source properties + Then Enter GCS property projectId and reference name + Then Override Service account details if set in environment variables + Then Enter GCS source property path "gcsTsvFile" + Then Select GCS property format "tsv" + Then Toggle GCS source property skip header to true + Then Validate output schema with expectedSchema "gcsTsvFileSchema" + Then Validate "GCS" plugin properties + Then Close the GCS properties + Then Open BigQuery sink properties + Then Override Service account details if set in environment variables + Then Enter the BigQuery sink mandatory properties + Then Validate "BigQuery" plugin properties + Then Close the BigQuery properties + Then Save the pipeline + Then Preview and run the pipeline + Then Wait till pipeline preview is in running state + Then Open and capture pipeline preview logs + Then Verify the preview run status of pipeline in the logs is "succeeded" + Then Close the pipeline logs + Then Click on preview data for BigQuery sink + Then Verify preview output schema matches the outputSchema captured in properties + Then Close the preview data + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Get count of no of records transferred to target BigQuery Table + Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table + + @GCS_PARQUET_TEST @BQ_SINK_TEST + Scenario: To verify successful data transfer from GCS source to BigQuery sink using parquet file format + Given Open Datafusion Project to configure pipeline + When Select plugin: "GCS" from the plugins list as: "Source" + When Expand Plugin group in the LHS plugins list: 
"Sink" + When Select plugin: "BigQuery" from the plugins list as: "Sink" + Then Connect source as "GCS" and sink as "BigQuery" to establish connection + Then Open GCS source properties + Then Enter GCS property projectId and reference name + Then Override Service account details if set in environment variables + Then Enter GCS source property path "gcsParquetFile" + Then Select GCS property format "parquet" + Then Validate output schema with expectedSchema "gcsParquetFileSchema" + Then Validate "GCS" plugin properties + Then Close the GCS properties + Then Open BigQuery sink properties + Then Override Service account details if set in environment variables + Then Enter the BigQuery sink mandatory properties + Then Validate "BigQuery" plugin properties + Then Close the BigQuery properties + Then Save the pipeline + Then Preview and run the pipeline + Then Wait till pipeline preview is in running state + Then Open and capture pipeline preview logs + Then Verify the preview run status of pipeline in the logs is "succeeded" + Then Close the pipeline logs + Then Click on preview data for BigQuery sink + Then Verify preview output schema matches the outputSchema captured in properties + Then Close the preview data + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Get count of no of records transferred to target BigQuery Table + Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table + + @GCS_JSON_TEST @BQ_SINK_TEST + Scenario: To verify successful data transfer from GCS source to BigQuery sink using json file format + Given Open Datafusion Project to configure pipeline + When Select plugin: "GCS" from the plugins list as: "Source" + When Expand Plugin group in the LHS plugins list: "Sink" + When Select plugin: "BigQuery" from the plugins list as: "Sink" + Then Connect 
source as "GCS" and sink as "BigQuery" to establish connection + Then Open GCS source properties + Then Enter GCS property projectId and reference name + Then Override Service account details if set in environment variables + Then Enter GCS source property path "gcsJsonFile" + Then Select GCS property format "json" + Then Enter GCS source property output schema "outputSchema" as macro argument "OutSchema" + Then Validate "GCS" plugin properties + Then Close the GCS properties + Then Open BigQuery sink properties + Then Override Service account details if set in environment variables + Then Enter the BigQuery sink mandatory properties + Then Validate "BigQuery" plugin properties + Then Close the BigQuery properties + Then Save the pipeline + Then Preview and run the pipeline + Then Enter runtime argument value "gcsJsonFileSchema" for key "OutSchema" + Then Run the preview of pipeline with runtime arguments + Then Wait till pipeline preview is in running state + Then Open and capture pipeline preview logs + Then Verify the preview run status of pipeline in the logs is "succeeded" + Then Close the pipeline logs + Then Close the preview + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Enter runtime argument value "gcsJsonFileSchema" for key "OutSchema" + Then Run the Pipeline in Runtime with runtime arguments + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Get count of no of records transferred to target BigQuery Table + Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table + + @GCS_CSV_TEST @BQ_SINK_TEST + Scenario: To verify Successful GCS to BigQuery data transfer with enable data file encryption flag true + Given Open Datafusion Project to configure pipeline + When Select plugin: "GCS" from the plugins list as: "Source" + When Expand Plugin group in the LHS plugins list: "Sink" + When Select plugin: 
"BigQuery" from the plugins list as: "Sink" + Then Connect source as "GCS" and sink as "BigQuery" to establish connection + Then Open GCS source properties + Then Enter GCS property projectId and reference name + Then Override Service account details if set in environment variables + Then Enter GCS source property path "gcsCsvFile" + Then Select GCS property format "csv" + Then Toggle GCS source property skip header to true + Then Validate output schema with expectedSchema "gcsCsvFileSchema" + Then Validate "GCS" plugin properties + Then Select radio button plugin property: "encrypted" with value: "true" + Then Close the GCS properties + Then Open BigQuery sink properties + Then Override Service account details if set in environment variables + Then Enter the BigQuery sink mandatory properties + Then Validate "BigQuery" plugin properties + Then Close the BigQuery properties + Then Save the pipeline + Then Preview and run the pipeline + Then Wait till pipeline preview is in running state + Then Open and capture pipeline preview logs + Then Verify the preview run status of pipeline in the logs is "succeeded" + Then Close the pipeline logs + Then Close the preview + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Get count of no of records transferred to target BigQuery Table + Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table + + @CMEK @GCS_CSV_TEST @BQ_SINK_TEST @BigQuery_Sink_Required + Scenario:To verify successful records transfer from GCS source to BigQuery sink with macro fields enabled at source + Given Open Datafusion Project to configure pipeline + When Select plugin: "GCS" from the plugins list as: "Source" + When Expand Plugin group in the LHS plugins list: "Sink" + When Select plugin: "BigQuery" from the plugins list as: "Sink" + Then Open GCS 
source properties + Then Enter GCS property reference name + Then Enter GCS property "projectId" as macro argument "gcsProjectId" + Then Enter GCS property "serviceAccountType" as macro argument "serviceAccountType" + Then Enter GCS property "serviceAccountFilePath" as macro argument "serviceAccount" + Then Enter GCS property "serviceAccountJSON" as macro argument "serviceAccount" + Then Enter GCS property "path" as macro argument "gcsSourcePath" + Then Enter GCS property "format" as macro argument "gcsFormat" + Then Enter GCS source property "skipHeader" as macro argument "gcsSkipHeader" + Then Click on the Macro button of Property: "sampleSize" and set the value to: "SampleSize" + Then Click on the Macro button of Property: "override" and set the value to: "OverRide" + Then Click on the Macro button of Property: "minSplitSize" and set the value to: "MinSplit" + Then Click on the Macro button of Property: "maxSplitSize" and set the value to: "MaxSplit" + Then Click on the Macro button of Property: "fileRegex" and set the value to: "FileReg" + Then Click on the Macro button of Property: "filenameOnly" and set the value to: "FilenameOnly" + Then Click on the Macro button of Property: "recursive" and set the value to: "ReadFilesRecursively" + Then Click on the Macro button of Property: "ignoreNonExistingFolders" and set the value to: "IgnoreNonExistingFolders" + Then Click on the Macro button of Property: "encrypted" and set the value to: "DataFileEncrypted" + Then Click on the Macro button of Property: "encryptedMetadataSuffix" and set the value to: "testmeta" + Then Click on the Macro button of Property: "fileSystemProperties" and set the value to: "FileSystemPr" + Then Click on the Macro button of Property: "fileEncoding" and set the value to: "Encode" + Then Enter GCS source property output schema "outputSchema" as macro argument "gcsOutputSchema" + Then Validate "GCS" plugin properties + Then Close the GCS properties + Then Open BigQuery sink properties + Then 
Enter BigQuery property reference name + Then Enter BigQuery property "projectId" as macro argument "bqProjectId" + Then Enter BigQuery property "datasetProjectId" as macro argument "bqDatasetProjectId" + Then Enter GCS property "serviceAccountType" as macro argument "serviceAccountType" + Then Enter GCS property "serviceAccountFilePath" as macro argument "serviceAccount" + Then Enter GCS property "serviceAccountJSON" as macro argument "serviceAccount" + Then Enter BigQuery property "dataset" as macro argument "bqDataset" + Then Enter BigQuery property "table" as macro argument "bqTargetTable" + Then Enter BigQuery cmek property "encryptionKeyName" as macro argument "cmekBQ" if cmek is enabled + Then Enter BigQuery sink property "truncateTable" as macro argument "bqTruncateTable" + Then Enter BigQuery sink property "updateTableSchema" as macro argument "bqUpdateTableSchema" + Then Validate "BigQuery" plugin properties + Then Close the BigQuery properties + Then Connect source as "GCS" and sink as "BigQuery" to establish connection + Then Save the pipeline + Then Preview and run the pipeline + Then Enter runtime argument value "projectId" for key "gcsProjectId" + Then Enter runtime argument value "serviceAccountType" for key "serviceAccountType" + Then Enter runtime argument value "serviceAccount" for key "serviceAccount" + Then Enter runtime argument value "gcsCsvFile" for GCS source property path key "gcsSourcePath" + Then Enter runtime argument value "gcsSkipHeaderTrue" for key "gcsSkipHeader" + Then Enter runtime argument value "csvFormat" for key "gcsFormat" + Then Enter runtime argument value "sampleSize" for key "SampleSize" + Then Enter runtime argument value "gcsOverrideField" for key "OverRide" + Then Enter runtime argument value "gcsMinSplitSize" for key "MinSplit" + Then Enter runtime argument value "gcsMaxSplitSize" for key "MaxSplit" + Then Enter runtime argument value "fileRegex" for key "FileReg" + Then Enter runtime argument value "filenameOnly" for 
GCS source property path key "FilenameOnly" + Then Enter runtime argument value "recursive" for GCS source property path key "ReadFilesRecursively" + Then Enter runtime argument value "ignoreNonExistingFolders" for GCS source property path key "IgnoreNonExistingFolders" + Then Enter runtime argument value "encrypted" for GCS source property path key "DataFileEncrypted" + Then Enter runtime argument value "encryptedMetadataSuffix" for GCS source property path key "testmeta" + Then Enter runtime argument value "gcsFileSysProperty" for key "FileSystemPr" + Then Enter runtime argument value "fileEncoding" for key "Encode" + Then Enter runtime argument value "gcsCSVFileOutputSchema" for key "gcsOutputSchema" + Then Enter runtime argument value "projectId" for key "bqProjectId" + Then Enter runtime argument value "projectId" for key "bqDatasetProjectId" + Then Enter runtime argument value "dataset" for key "bqDataset" + Then Enter runtime argument value for BigQuery sink table name key "bqTargetTable" + Then Enter runtime argument value "cmekBQ" for BigQuery cmek property key "cmekBQ" if BQ cmek is enabled + Then Enter runtime argument value "bqTruncateTableTrue" for key "bqTruncateTable" + Then Enter runtime argument value "bqUpdateTableSchemaTrue" for key "bqUpdateTableSchema" + Then Run the preview of pipeline with runtime arguments + Then Wait till pipeline preview is in running state + Then Open and capture pipeline preview logs + Then Verify the preview run status of pipeline in the logs is "succeeded" + Then Close the pipeline logs + Then Close the preview + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Enter runtime argument value "projectId" for key "gcsProjectId" + Then Enter runtime argument value "serviceAccountType" for key "serviceAccountType" + Then Enter runtime argument value "serviceAccount" for key "serviceAccount" + Then Enter runtime argument value "gcsCsvFile" for GCS source property path key "gcsSourcePath" + Then Enter runtime 
argument value "gcsSkipHeaderTrue" for key "gcsSkipHeader" + Then Enter runtime argument value "csvFormat" for key "gcsFormat" + Then Enter runtime argument value "sampleSize" for key "SampleSize" + Then Enter runtime argument value "gcsOverrideField" for key "OverRide" + Then Enter runtime argument value "gcsMinSplitSize" for key "MinSplit" + Then Enter runtime argument value "gcsMaxSplitSize" for key "MaxSplit" + Then Enter runtime argument value "fileRegex" for key "FileReg" + Then Enter runtime argument value "filenameOnly" for GCS source property path key "FilenameOnly" + Then Enter runtime argument value "recursive" for GCS source property path key "ReadFilesRecursively" + Then Enter runtime argument value "ignoreNonExistingFolders" for GCS source property path key "IgnoreNonExistingFolders" + Then Enter runtime argument value "encrypted" for GCS source property path key "DataFileEncrypted" + Then Enter runtime argument value "encryptedMetadataSuffix" for GCS source property path key "testmeta" + Then Enter runtime argument value "gcsFileSysProperty" for key "FileSystemPr" + Then Enter runtime argument value "fileEncoding" for key "Encode" + Then Enter runtime argument value "gcsCSVFileOutputSchema" for key "gcsOutputSchema" + Then Enter runtime argument value "projectId" for key "bqProjectId" + Then Enter runtime argument value "projectId" for key "bqDatasetProjectId" + Then Enter runtime argument value "dataset" for key "bqDataset" + Then Enter runtime argument value for BigQuery sink table name key "bqTargetTable" + Then Enter runtime argument value "cmekBQ" for BigQuery cmek property key "cmekBQ" if BQ cmek is enabled + Then Enter runtime argument value "bqTruncateTableTrue" for key "bqTruncateTable" + Then Enter runtime argument value "bqUpdateTableSchemaTrue" for key "bqUpdateTableSchema" + Then Run the Pipeline in Runtime with runtime arguments + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status 
is "Succeeded" + Then Get count of no of records transferred to target BigQuery Table + Then Validate the cmek key "cmekBQ" of target BigQuery table if cmek is enabled + Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table + + @GCS_OUTPUT_FIELD_TEST @BQ_SINK_TEST + Scenario: To verify successful data transfer from GCS to BigQuery with macro Path Field + Given Open Datafusion Project to configure pipeline + When Select plugin: "GCS" from the plugins list as: "Source" + When Expand Plugin group in the LHS plugins list: "Sink" + When Select plugin: "BigQuery" from the plugins list as: "Sink" + Then Connect source as "GCS" and sink as "BigQuery" to establish connection + Then Open GCS source properties + Then Enter GCS property projectId and reference name + Then Override Service account details if set in environment variables + Then Enter GCS source property path "gcsOutputFieldTestFile" + Then Select GCS property format "csv" + Then Click on the Macro button of Property: "pathField" and set the value to: "PathF" + Then Toggle GCS source property skip header to true + Then Enter GCS source property output schema "outputSchema" as macro argument "gcsOutputSchema" + Then Validate "GCS" plugin properties + Then Close the GCS properties + Then Open BigQuery sink properties + Then Override Service account details if set in environment variables + Then Enter the BigQuery sink mandatory properties + Then Validate "BigQuery" plugin properties + Then Close the BigQuery properties + Then Connect source as "GCS" and sink as "BigQuery" to establish connection + Then Save the pipeline + Then Preview and run the pipeline + Then Enter runtime argument value "gcsPathField" for key "PathF" + Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" + Then Run the preview of pipeline with runtime arguments + Then Wait till pipeline preview is in running state + Then Open and capture pipeline 
preview logs + Then Verify the preview run status of pipeline in the logs is "succeeded" + Then Close the pipeline logs + Then Close the preview + Then Deploy the pipeline + Then Run the Pipeline in Runtime + Then Enter runtime argument value "gcsPathField" for key "PathF" + Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" + Then Run the Pipeline in Runtime with runtime arguments + Then Wait till pipeline is in running state + Then Open and capture logs + Then Verify the pipeline status is "Succeeded" + Then Get count of no of records transferred to target BigQuery Table + Then Verify output field "gcsPathField" in target BigQuery table contains path of the source GcsBucket "gcsOutputFieldTestFile" diff --git a/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java b/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java index 8662619b29..cee5d08468 100644 --- a/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java +++ b/src/e2e-test/java/io/cdap/plugin/common/stepsdesign/TestSetupHooks.java @@ -134,6 +134,11 @@ public static void createBucketWithTSVFile() throws IOException, URISyntaxExcept gcsSourceBucketName = createGCSBucketWithFile(PluginPropertyUtils.pluginProp("gcsTsvFile")); } + @Before(order = 1, value = "@GCS_JSON_TEST") + public static void createBucketWithJSONFile() throws IOException, URISyntaxException { + gcsSourceBucketName = createGCSBucketWithFile(PluginPropertyUtils.pluginProp("gcsJsonFile")); + } + @Before(order = 1, value = "@GCS_BLOB_TEST") public static void createBucketWithBlobFile() throws IOException, URISyntaxException { gcsSourceBucketName = createGCSBucketWithFile(PluginPropertyUtils.pluginProp("gcsBlobFile")); @@ -205,7 +210,7 @@ public static void createBucketWithAvroTestFile() throws IOException, URISyntaxE "or @GCS_DELIMITED_TEST or @GCS_TEXT_TEST or @GCS_OUTPUT_FIELD_TEST or @GCS_DATATYPE_1_TEST or " + "@GCS_DATATYPE_2_TEST or 
@GCS_READ_RECURSIVE_TEST or @GCS_DELETE_WILDCARD_TEST or @GCS_CSV_RANGE_TEST or" + " @GCS_PARQUET_TEST or @GCS_AVRO_TEST or @GCS_DATATYPE_TEST or @GCS_AVRO_FILE or @GCS_CSV or " + - "GCS_MULTIPLE_FILES_TEST or GCS_MULTIPLE_FILES_REGEX_TEST") + "@GCS_MULTIPLE_FILES_TEST or @GCS_MULTIPLE_FILES_REGEX_TEST or @GCS_JSON_TEST") public static void deleteSourceBucketWithFile() { deleteGCSBucket(gcsSourceBucketName); PluginPropertyUtils.removePluginProp("gcsSourceBucketName"); diff --git a/src/e2e-test/resources/pluginParameters.properties b/src/e2e-test/resources/pluginParameters.properties index aae33e0e89..4668d7e146 100644 --- a/src/e2e-test/resources/pluginParameters.properties +++ b/src/e2e-test/resources/pluginParameters.properties @@ -45,7 +45,6 @@ gcsWildcardPath2=testdata/GCS_WILDCARD_TEST/wildcard* gcsWildcardPath3=testdata/GCS_WILDCARD_TEST/test* gcsWildcardMultiBucketsPath1=testdata/GCS_RECURSIVE_TEST/*.csv;\ testdata/GCS_RECURSIVE_TEST/recursiveFile2* - gcsOverrideField=id gcsOverrideInt_FloatSchema=[{"key":"id","value":"float"},{"key":"name","value":"string"},\ {"key":"yearofbirth","value":"int"},{"key":"isdeleted","value":"boolean"},{"key":"email","value":"string"},\ @@ -159,6 +158,23 @@ gcsParquetFileSchema=[{"key":"workforce","value":"string"},{"key":"report_year", {"key":"race_black","value":"long"},{"key":"race_hispanic_latinx","value":"long"},\ {"key":"race_native_american","value":"long"},{"key":"race_white","value":"long"},\ {"key":"tablename","value":"string"}] +gcsJsonFile=testdata/GCS_JSON_TEST.json +gcsJsonFileSchema={ "type": "record", "name": "text", "fields": [ { "name":"user", "type": "string" }, { "name": "age", "type": "int" }, { "name": "city", "type": "string" } ] } +sampleSize=1000 +fileRegex=.*\\.csv$ +pathField=Employeename +minSplitSize=100 +maxSplitSize=120 +encrypted=false +recursive=false +ignoreNonExistingFolders=false +filenameOnly=false +fileEncoding=UTF-8 +encryptedMetadataSuffix=.metadata +gcsPathFieldOutputSchema={ "type": 
"record", "name": "text", "fields": [ { "name": "id", "type": "int" }, \ + { "name": "firstname", "type": "string" }, { "name": "lastname", "type": "string" }, \ + { "name": "photo", "type": "string" },{ "name": "outputfield", "type": "string" },\ + { "name": "pathFieldColumn", "type": "string" } ] } ## GCS-PLUGIN-PROPERTIES-END ## BIGQUERY-PLUGIN-PROPERTIES-START diff --git a/src/e2e-test/resources/testdata/GCS_JSON_TEST.json b/src/e2e-test/resources/testdata/GCS_JSON_TEST.json new file mode 100755 index 0000000000..2b7c21e6a3 --- /dev/null +++ b/src/e2e-test/resources/testdata/GCS_JSON_TEST.json @@ -0,0 +1 @@ +{"user": "Alice","age": "25","city": "New York"} From 7c0f4ace5ddb45231595a3228af40b0f2a8ab7e7 Mon Sep 17 00:00:00 2001 From: bijay27bit Date: Wed, 11 Dec 2024 06:14:06 +0000 Subject: [PATCH 2/3] Incorporated 3rd review comments. --- .../gcs/source/GCSSourceToBigQuery.feature | 57 ++----------------- .../resources/pluginParameters.properties | 9 ++- 2 files changed, 10 insertions(+), 56 deletions(-) diff --git a/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature b/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature index 2d7d530f7e..c6434ff89e 100644 --- a/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature +++ b/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature @@ -437,7 +437,7 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Get count of no of records transferred to target BigQuery Table Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table - @CMEK @GCS_CSV_TEST @BQ_SINK_TEST @BigQuery_Sink_Required + @GCS_CSV_TEST @BQ_SINK_TEST @BigQuery_Sink_Required Scenario:To verify successful records transfer from GCS source to BigQuery sink with macro fields enabled at source Given Open Datafusion Project to configure pipeline When Select plugin: "GCS" from the plugins list as: "Source" @@ -457,6 +457,7 @@ Feature: GCS source - 
Verification of GCS to BQ successful data transfer Then Click on the Macro button of Property: "minSplitSize" and set the value to: "MinSplit" Then Click on the Macro button of Property: "maxSplitSize" and set the value to: "MaxSplit" Then Click on the Macro button of Property: "fileRegex" and set the value to: "FileReg" + Then Click on the Macro button of Property: "pathField" and set the value to: "PathF" Then Click on the Macro button of Property: "filenameOnly" and set the value to: "FilenameOnly" Then Click on the Macro button of Property: "recursive" and set the value to: "ReadFilesRecursively" Then Click on the Macro button of Property: "ignoreNonExistingFolders" and set the value to: "IgnoreNonExistingFolders" @@ -476,7 +477,6 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter GCS property "serviceAccountJSON" as macro argument "serviceAccount" Then Enter BigQuery property "dataset" as macro argument "bqDataset" Then Enter BigQuery property "table" as macro argument "bqTargetTable" - Then Enter BigQuery cmek property "encryptionKeyName" as macro argument "cmekBQ" if cmek is enabled Then Enter BigQuery sink property "truncateTable" as macro argument "bqTruncateTable" Then Enter BigQuery sink property "updateTableSchema" as macro argument "bqUpdateTableSchema" Then Validate "BigQuery" plugin properties @@ -495,6 +495,7 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter runtime argument value "gcsMinSplitSize" for key "MinSplit" Then Enter runtime argument value "gcsMaxSplitSize" for key "MaxSplit" Then Enter runtime argument value "fileRegex" for key "FileReg" + Then Enter runtime argument value "gcsPathField" for key "PathF" Then Enter runtime argument value "filenameOnly" for GCS source property path key "FilenameOnly" Then Enter runtime argument value "recursive" for GCS source property path key "ReadFilesRecursively" Then Enter runtime argument value "ignoreNonExistingFolders" for GCS 
source property path key "IgnoreNonExistingFolders" @@ -502,12 +503,11 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter runtime argument value "encryptedMetadataSuffix" for GCS source property path key "testmeta" Then Enter runtime argument value "gcsFileSysProperty" for key "FileSystemPr" Then Enter runtime argument value "fileEncoding" for key "Encode" - Then Enter runtime argument value "gcsCSVFileOutputSchema" for key "gcsOutputSchema" + Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" Then Enter runtime argument value "projectId" for key "bqProjectId" Then Enter runtime argument value "projectId" for key "bqDatasetProjectId" Then Enter runtime argument value "dataset" for key "bqDataset" Then Enter runtime argument value for BigQuery sink table name key "bqTargetTable" - Then Enter runtime argument value "cmekBQ" for BigQuery cmek property key "cmekBQ" if BQ cmek is enabled Then Enter runtime argument value "bqTruncateTableTrue" for key "bqTruncateTable" Then Enter runtime argument value "bqUpdateTableSchemaTrue" for key "bqUpdateTableSchema" Then Run the preview of pipeline with runtime arguments @@ -529,6 +529,7 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter runtime argument value "gcsMinSplitSize" for key "MinSplit" Then Enter runtime argument value "gcsMaxSplitSize" for key "MaxSplit" Then Enter runtime argument value "fileRegex" for key "FileReg" + Then Enter runtime argument value "gcsPathField" for key "PathF" Then Enter runtime argument value "filenameOnly" for GCS source property path key "FilenameOnly" Then Enter runtime argument value "recursive" for GCS source property path key "ReadFilesRecursively" Then Enter runtime argument value "ignoreNonExistingFolders" for GCS source property path key "IgnoreNonExistingFolders" @@ -536,12 +537,11 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter runtime 
argument value "encryptedMetadataSuffix" for GCS source property path key "testmeta" Then Enter runtime argument value "gcsFileSysProperty" for key "FileSystemPr" Then Enter runtime argument value "fileEncoding" for key "Encode" - Then Enter runtime argument value "gcsCSVFileOutputSchema" for key "gcsOutputSchema" + Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" Then Enter runtime argument value "projectId" for key "bqProjectId" Then Enter runtime argument value "projectId" for key "bqDatasetProjectId" Then Enter runtime argument value "dataset" for key "bqDataset" Then Enter runtime argument value for BigQuery sink table name key "bqTargetTable" - Then Enter runtime argument value "cmekBQ" for BigQuery cmek property key "cmekBQ" if BQ cmek is enabled Then Enter runtime argument value "bqTruncateTableTrue" for key "bqTruncateTable" Then Enter runtime argument value "bqUpdateTableSchemaTrue" for key "bqUpdateTableSchema" Then Run the Pipeline in Runtime with runtime arguments @@ -549,49 +549,4 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Open and capture logs Then Verify the pipeline status is "Succeeded" Then Get count of no of records transferred to target BigQuery Table - Then Validate the cmek key "cmekBQ" of target BigQuery table if cmek is enabled Then Validate the values of records transferred from GCS bucket file is equal to the values of target BigQuery table - - @GCS_OUTPUT_FIELD_TEST @BQ_SINK_TEST - Scenario: To verify successful data transfer from GCS to BigQuery with macro Path Field - Given Open Datafusion Project to configure pipeline - When Select plugin: "GCS" from the plugins list as: "Source" - When Expand Plugin group in the LHS plugins list: "Sink" - When Select plugin: "BigQuery" from the plugins list as: "Sink" - Then Connect source as "GCS" and sink as "BigQuery" to establish connection - Then Open GCS source properties - Then Enter GCS property projectId and 
reference name - Then Override Service account details if set in environment variables - Then Enter GCS source property path "gcsOutputFieldTestFile" - Then Select GCS property format "csv" - Then Click on the Macro button of Property: "pathField" and set the value to: "PathF" - Then Toggle GCS source property skip header to true - Then Enter GCS source property output schema "outputSchema" as macro argument "gcsOutputSchema" - Then Validate "GCS" plugin properties - Then Close the GCS properties - Then Open BigQuery sink properties - Then Override Service account details if set in environment variables - Then Enter the BigQuery sink mandatory properties - Then Validate "BigQuery" plugin properties - Then Close the BigQuery properties - Then Connect source as "GCS" and sink as "BigQuery" to establish connection - Then Save the pipeline - Then Preview and run the pipeline - Then Enter runtime argument value "gcsPathField" for key "PathF" - Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" - Then Run the preview of pipeline with runtime arguments - Then Wait till pipeline preview is in running state - Then Open and capture pipeline preview logs - Then Verify the preview run status of pipeline in the logs is "succeeded" - Then Close the pipeline logs - Then Close the preview - Then Deploy the pipeline - Then Run the Pipeline in Runtime - Then Enter runtime argument value "gcsPathField" for key "PathF" - Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" - Then Run the Pipeline in Runtime with runtime arguments - Then Wait till pipeline is in running state - Then Open and capture logs - Then Verify the pipeline status is "Succeeded" - Then Get count of no of records transferred to target BigQuery Table - Then Verify output field "gcsPathField" in target BigQuery table contains path of the source GcsBucket "gcsOutputFieldTestFile" diff --git a/src/e2e-test/resources/pluginParameters.properties 
b/src/e2e-test/resources/pluginParameters.properties index 4668d7e146..ba9730f1ce 100644 --- a/src/e2e-test/resources/pluginParameters.properties +++ b/src/e2e-test/resources/pluginParameters.properties @@ -13,7 +13,7 @@ gcsCsvFileSchema=[{"key":"EmployeeDepartment","value":"string"},{"key":"Employee {"key":"Salary","value":"int"},{"key":"wotkhours","value":"int"}] gcsCSVFileOutputSchema={ "type": "record", "name": "text", "fields": [ \ { "name": "EmployeeDepartment", "type": "string" }, { "name": "Employeename", "type": "string" }, \ - { "name": "Salary", "type": "int" }, { "name": "wotkhours", "type": "int" } ] } + { "name": "Salary", "type": "int" }, { "name": "wotkhours", "type": "int" }] } gcsTsvFile=testdata/GCS_TSV_TEST.tsv gcsTsvFileSchema=[{"key":"testscenarioid","value":"string"},{"key":"testdescription","value":"string"},\ {"key":"testconditionid","value":"string"},{"key":"testtype","value":"string"}] @@ -171,10 +171,9 @@ ignoreNonExistingFolders=false filenameOnly=false fileEncoding=UTF-8 encryptedMetadataSuffix=.metadata -gcsPathFieldOutputSchema={ "type": "record", "name": "text", "fields": [ { "name": "id", "type": "int" }, \ - { "name": "firstname", "type": "string" }, { "name": "lastname", "type": "string" }, \ - { "name": "photo", "type": "string" },{ "name": "outputfield", "type": "string" },\ - { "name": "pathFieldColumn", "type": "string" } ] } +gcsPathFieldOutputSchema={ "type": "record", "name": "text", "fields": [ \ + { "name": "EmployeeDepartment", "type": "string" }, { "name": "Employeename", "type": "string" }, \ + { "name": "Salary", "type": "int" }, { "name": "wotkhours", "type": "int" }, { "name": "pathFieldColumn", "type": "string" } ] } ## GCS-PLUGIN-PROPERTIES-END ## BIGQUERY-PLUGIN-PROPERTIES-START From b968911947a186ef077bf1246efb9f48b2dedb68 Mon Sep 17 00:00:00 2001 From: bijay27bit Date: Tue, 24 Dec 2024 13:59:43 +0000 Subject: [PATCH 3/3] Added data validation in macro scenario. 
--- src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature | 4 ++-- src/e2e-test/resources/pluginParameters.properties | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature b/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature index c6434ff89e..967afe456c 100644 --- a/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature +++ b/src/e2e-test/features/gcs/source/GCSSourceToBigQuery.feature @@ -501,7 +501,7 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter runtime argument value "ignoreNonExistingFolders" for GCS source property path key "IgnoreNonExistingFolders" Then Enter runtime argument value "encrypted" for GCS source property path key "DataFileEncrypted" Then Enter runtime argument value "encryptedMetadataSuffix" for GCS source property path key "testmeta" - Then Enter runtime argument value "gcsFileSysProperty" for key "FileSystemPr" + Then Enter runtime argument value "gcsCSVFileSysProperty" for key "FileSystemPr" Then Enter runtime argument value "fileEncoding" for key "Encode" Then Enter runtime argument value "gcsPathFieldOutputSchema" for key "gcsOutputSchema" Then Enter runtime argument value "projectId" for key "bqProjectId" @@ -535,7 +535,7 @@ Feature: GCS source - Verification of GCS to BQ successful data transfer Then Enter runtime argument value "ignoreNonExistingFolders" for GCS source property path key "IgnoreNonExistingFolders" Then Enter runtime argument value "encrypted" for GCS source property path key "DataFileEncrypted" Then Enter runtime argument value "encryptedMetadataSuffix" for GCS source property path key "testmeta" - Then Enter runtime argument value "gcsFileSysProperty" for key "FileSystemPr" + Then Enter runtime argument value "gcsCSVFileSysProperty" for key "FileSystemPr" Then Enter runtime argument value "fileEncoding" for key "Encode" Then Enter runtime argument value "gcsPathFieldOutputSchema" for 
key "gcsOutputSchema" Then Enter runtime argument value "projectId" for key "bqProjectId" diff --git a/src/e2e-test/resources/pluginParameters.properties b/src/e2e-test/resources/pluginParameters.properties index ba9730f1ce..24be7967f9 100644 --- a/src/e2e-test/resources/pluginParameters.properties +++ b/src/e2e-test/resources/pluginParameters.properties @@ -109,6 +109,7 @@ gcsDataTypeTest2File=testdata/GCS_DATATYPE_TEST_2.csv gcsReadRecursivePath=testdata/GCS_RECURSIVE_TEST gcsReadWildcardPath=testdata/GCS_WILDCARD_TEST,testdata/GCS_WILDCARD_TEST/test gcsFileSysProperty={"textinputformat.record.delimiter": "@"} +gcsCSVFileSysProperty={"csvinputformat.record.csv": "1"} gcsDatatypeChange=[{"key":"createddate","value":"datetime"},{"key":"revenue","value":"double"},\ {"key":"points","value":"decimal"},{"key":"BytesData","value":"bytes"}] gcsDataTypeTestFileSchema=[{"key":"id","value":"int"},{"key":"name","value":"string"},\