diff --git a/core-plugins/docs/File-batchsource.md b/core-plugins/docs/File-batchsource.md index 5297278cf..c3b0b6468 100644 --- a/core-plugins/docs/File-batchsource.md +++ b/core-plugins/docs/File-batchsource.md @@ -15,14 +15,14 @@ Properties **Path:** Path to read from. For example, s3a:///path/to/input **Format:** Format of the data to read. -The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', 'tsv', or the +The format must be one of 'avro', 'blob', 'csv', 'delimited', 'json', 'parquet', 'text', 'tsv', 'xls', or the name of any format plugin that you have deployed to your environment. If the format is a macro, only the pre-packaged formats can be used. If the format is 'blob', every input file will be read into a separate record. The 'blob' format also requires a schema that contains a field named 'body' of type 'bytes'. If the format is 'text', the schema must contain a field named 'body' of type 'string'. -**Get Schema:** Auto-detects schema from file. Supported formats are: avro, parquet, csv, delimited, tsv, blob +**Get Schema:** Auto-detects schema from file. Supported formats are: avro, parquet, csv, delimited, tsv, blob, xls and text. Blob - is set by default as field named 'body' of type bytes. @@ -42,9 +42,16 @@ If no such file can be found, an error will be returned. **Sample Size:** The maximum number of rows in a file that will get investigated for automatic data type detection. +**Terminate If Empty Row:** Specify whether to stop reading after encountering the first empty row. Defaults to false. + +**Select Sheet Using:** Select the sheet by name or number. Default is 'Sheet Number'. + +**Sheet Value:** The name/number of the sheet to read from. If not specified, the first sheet will be read. +Sheet numbers are 0-based, i.e. the first sheet is 0. + **Delimiter:** Delimiter to use when the format is 'delimited'. This will be ignored for other formats. 
-**Use First Row as Header:** Whether to use the first line of each file as the column headers. Supported formats are 'text', 'csv', 'tsv', 'delimited'. +**Use First Row as Header:** Whether to use the first line of each file as the column headers. Supported formats are 'text', 'csv', 'tsv', 'xls', 'delimited'. **Enable Quoted Values** Whether to treat content between quotes as a value. This value will only be used if the format is 'csv', 'tsv' or 'delimited'. For example, if this is set to true, a line that looks like `1, "a, b, c"` will output two fields. diff --git a/core-plugins/src/main/java/io/cdap/plugin/batch/source/FileSourceConfig.java b/core-plugins/src/main/java/io/cdap/plugin/batch/source/FileSourceConfig.java index 9eda2b3c1..3082bb353 100644 --- a/core-plugins/src/main/java/io/cdap/plugin/batch/source/FileSourceConfig.java +++ b/core-plugins/src/main/java/io/cdap/plugin/batch/source/FileSourceConfig.java @@ -20,6 +20,7 @@ import com.google.gson.reflect.TypeToken; import io.cdap.cdap.api.annotation.Description; import io.cdap.cdap.api.annotation.Macro; +import io.cdap.cdap.api.annotation.Name; import io.cdap.cdap.api.data.schema.Schema; import io.cdap.cdap.etl.api.FailureCollector; import io.cdap.plugin.format.FileFormat; @@ -37,6 +38,9 @@ public class FileSourceConfig extends AbstractFileSourceConfig { public static final String NAME_FILE_SYSTEM_PROPERTIES = "fileSystemProperties"; public static final String NAME_PATH = "path"; public static final String NAME_FILE_ENCODING = "fileEncoding"; + public static final String NAME_SHEET = "sheet"; + public static final String NAME_SHEET_VALUE = "sheetValue"; + public static final String NAME_TERMINATE_IF_EMPTY_ROW = "terminateIfEmptyRow"; private static final Gson GSON = new Gson(); private static final Type MAP_STRING_STRING_TYPE = new TypeToken>() { }.getType(); @@ -64,6 +68,25 @@ public class FileSourceConfig extends AbstractFileSourceConfig { @Nullable @Description("The maximum number of rows that will 
get investigated for automatic data type detection.") private Long sampleSize; + + @Name(NAME_SHEET) + @Macro + @Nullable + @Description("Select the sheet by name or number. Default is 'Sheet Number'.") + private String sheet; + + @Name(NAME_SHEET_VALUE) + @Macro + @Nullable + @Description("The name/number of the sheet to read from. If not specified, the first sheet will be read. " + + "Sheet Number are 0 based, ie first sheet is 0.") + private String sheetValue; + + @Name(NAME_TERMINATE_IF_EMPTY_ROW) + @Macro + @Nullable + @Description("Specify whether to stop reading after encountering the first empty row. Defaults to false.") + private String terminateIfEmptyRow; FileSourceConfig() { super(); diff --git a/core-plugins/widgets/File-batchsource.json b/core-plugins/widgets/File-batchsource.json index 3ab0704cc..604571d85 100644 --- a/core-plugins/widgets/File-batchsource.json +++ b/core-plugins/widgets/File-batchsource.json @@ -101,6 +101,42 @@ "label": "False" } } + }, + { + "widget-type": "toggle", + "label": "Terminate If Empty Row", + "name": "terminateIfEmptyRow", + "widget-attributes": { + "default": "false", + "on": { + "value": "true", + "label": "True" + }, + "off": { + "value": "false", + "label": "False" + } + } + }, + { + "widget-type": "select", + "label": "Select Sheet Using", + "name": "sheet", + "widget-attributes": { + "values": [ + "Sheet Name", + "Sheet Number" + ], + "default": "Sheet Number" + } + }, + { + "widget-type": "textbox", + "label": "Sheet Value", + "name": "sheetValue", + "widget-attributes": { + "default": "0" + } } ] }, @@ -499,13 +535,46 @@ { "name": "skipHeader", "condition": { - "expression": "format == 'delimited' || format == 'csv' || format == 'tsv'" + "expression": "format == 'delimited' || format == 'csv' || format == 'tsv' || format == 'xls'" }, "show": [ { "name": "skipHeader" } ] + }, + { + "name": "sheet", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "sheet" + } + ] + }, + { + "name": 
"sheetValue", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "sheetValue" + } + ] + }, + { + "name": "terminateIfEmptyRow", + "condition": { + "expression": "format == 'xls'" + }, + "show": [ + { + "name": "terminateIfEmptyRow" + } + ] } ], "outputs": [