feat(python): Expose top-level "has_header" param in read_excel across all supported engines
alexander-beedie committed Aug 13, 2024
1 parent fac700d commit c5d3b87
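
For context, a minimal usage sketch of the newly exposed parameter (file paths, sheet names and data are illustrative placeholders, not part of this commit):

    import polars as pl

    # default behaviour is unchanged: the first row is read as the header
    df = pl.read_excel("data.xlsx", sheet_name="Sheet1", engine="calamine")

    # with has_header=False the first row is read as data and column names
    # are autogenerated as column_1, column_2, ...
    df = pl.read_excel("data.xlsx", sheet_name="Sheet1", has_header=False)

    # the same top-level parameter is exposed on read_ods
    df = pl.read_ods("data.ods", has_header=False)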
Showing 2 changed files with 145 additions and 59 deletions.
178 changes: 124 additions & 54 deletions py-polars/polars/io/spreadsheet/functions.py
@@ -50,6 +50,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -65,6 +66,7 @@ def read_excel(
sheet_name: None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
has_header: bool = ...,
read_options: dict[str, Any] | None = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
@@ -82,6 +84,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -100,6 +103,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -116,6 +120,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -132,6 +137,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -149,6 +155,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = "calamine",
engine_options: dict[str, Any] | None = None,
read_options: dict[str, Any] | None = None,
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
@@ -207,6 +214,10 @@ def read_excel(
* "calamine": `ExcelReader.load_sheet_by_name`
* "xlsx2csv": `pl.read_csv`
* "openpyxl": n/a (can only provide `engine_options`)
has_header
Indicate if the first row of the table data is a header or not. If False,
column names will be autogenerated in the following format: `column_x`, with
`x` being an enumeration over every column in the dataset, starting at 1.
columns
Columns to read from the sheet; if not specified, all columns are read. Can
be given as a sequence of column names or indices.
@@ -285,6 +296,7 @@ def read_excel(
schema_overrides=schema_overrides,
infer_schema_length=infer_schema_length,
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
)

@@ -295,6 +307,7 @@ def read_ods(
*,
sheet_id: None = ...,
sheet_name: str,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -308,6 +321,7 @@ def read_ods(
*,
sheet_id: None = ...,
sheet_name: None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -321,6 +335,7 @@ def read_ods(
*,
sheet_id: int,
sheet_name: str,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -334,6 +349,7 @@ def read_ods(
*,
sheet_id: Literal[0] | Sequence[int],
sheet_name: None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -347,6 +363,7 @@ def read_ods(
*,
sheet_id: int,
sheet_name: None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -360,6 +377,7 @@ def read_ods(
*,
sheet_id: None,
sheet_name: list[str] | tuple[str],
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -372,6 +390,7 @@ def read_ods(
*,
sheet_id: int | Sequence[int] | None = None,
sheet_name: str | list[str] | tuple[str] | None = None,
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
@@ -396,6 +415,10 @@ def read_ods(
sheet_name
Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If
more than one is given then a `{sheetname:frame,}` dict is returned.
has_header
Indicate if the first row of the table data is a header or not. If False,
column names will be autogenerated in the following format: `column_x`, with
`x` being an enumeration over every column in the dataset, starting at 1.
columns
Columns to read from the sheet; if not specified, all columns are read. Can
be given as a sequence of column names or indices.
@@ -446,6 +469,7 @@ def read_ods(
schema_overrides=schema_overrides,
infer_schema_length=infer_schema_length,
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
)

@@ -495,52 +519,32 @@ def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None:
def _read_spreadsheet(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
*,
source: str | Path | IO[bytes] | bytes,
engine: ExcelSpreadsheetEngine,
engine_options: dict[str, Any] | None = None,
read_options: dict[str, Any] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
columns: Sequence[int] | Sequence[str] | None = None,
*,
has_header: bool = True,
raise_if_empty: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
if looks_like_url(source):
source = process_file_url(source)

read_options = (read_options or {}).copy()
read_options = _get_read_options(
read_options,
engine=engine,
columns=columns,
has_header=has_header,
infer_schema_length=infer_schema_length,
)
engine_options = (engine_options or {}).copy()
schema_overrides = dict(schema_overrides or {})

# normalise some top-level parameters to 'read_options' entries
if engine == "calamine":
if ("use_columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["use_columns"]`'
raise ParameterCollisionError(msg)
elif ("schema_sample_rows" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`'
raise ParameterCollisionError(msg)

read_options["schema_sample_rows"] = infer_schema_length

elif engine == "xlsx2csv":
if ("columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["columns"]`'
raise ParameterCollisionError(msg)
elif ("infer_schema_length" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`'
raise ParameterCollisionError(msg)

read_options["infer_schema_length"] = infer_schema_length
else:
read_options["infer_schema_length"] = infer_schema_length

# establish the reading function, parser, and available worksheets
reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
engine, source, engine_options
@@ -573,6 +577,59 @@ def _read_spreadsheet(
return next(iter(parsed_sheets.values()))


def _get_read_options(
read_options: dict[str, Any] | None,
*,
engine: ExcelSpreadsheetEngine,
columns: Sequence[int] | Sequence[str] | None,
infer_schema_length: int | None,
has_header: bool,
) -> dict[str, Any]:
"""Normalise top-level parameters to engine-specific 'read_options' dict."""
read_options = (read_options or {}).copy()
if engine == "calamine":
if ("use_columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["use_columns"]`'
raise ParameterCollisionError(msg)
elif read_options.get("header_row") is not None and has_header is False:
msg = 'the values of `has_header` and `read_options["header_row"]` are not compatible'
raise ParameterCollisionError(msg)
elif ("schema_sample_rows" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`'
raise ParameterCollisionError(msg)

read_options["schema_sample_rows"] = infer_schema_length
if has_header is False and "header_row" not in read_options:
read_options["header_row"] = None

elif engine == "xlsx2csv":
if ("columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["columns"]`'
raise ParameterCollisionError(msg)
elif (
"has_header" in read_options
and read_options["has_header"] is not has_header
):
msg = 'the values of `has_header` and `read_options["has_header"]` are not compatible'
raise ParameterCollisionError(msg)
elif ("infer_schema_length" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`'
raise ParameterCollisionError(msg)

read_options["infer_schema_length"] = infer_schema_length
if "has_header" not in read_options:
read_options["has_header"] = has_header
else:
read_options["infer_schema_length"] = infer_schema_length
read_options["has_header"] = has_header

return read_options
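
For illustration (not part of the diff; the file path is a placeholder), the checks above mean that a conflicting engine-specific option passed via `read_options` now raises ParameterCollisionError, while the top-level `has_header` is otherwise normalised into the engine's own option:

    import polars as pl

    # conflicting: has_header=False together with an explicit calamine
    # "header_row" -> ParameterCollisionError
    pl.read_excel(
        "data.xlsx",
        engine="calamine",
        has_header=False,
        read_options={"header_row": 2},
    )

    # non-conflicting: has_header=False is translated to header_row=None
    # for calamine (and to has_header=False for xlsx2csv)
    pl.read_excel("data.xlsx", engine="calamine", has_header=False)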


def _get_sheet_names(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
@@ -695,13 +752,7 @@ def _csv_buffer_to_frame(
"""Translate StringIO buffer containing delimited data as a DataFrame."""
# handle (completely) empty sheet data
if csv.tell() == 0:
if raise_if_empty:
msg = (
"empty Excel sheet"
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
)
raise NoDataError(msg)
return pl.DataFrame()
return _empty_frame(raise_if_empty)

if read_options is None:
read_options = {}
@@ -754,18 +805,21 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:
df = df.drop(*null_cols)

if len(df) == 0 and len(df.columns) == 0:
if not raise_if_empty:
return df
else:
msg = (
"empty Excel sheet"
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
)
raise NoDataError(msg)
return _empty_frame(raise_if_empty)

return df.filter(~F.all_horizontal(F.all().is_null()))


def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
if raise_if_empty:
msg = (
"empty Excel sheet"
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
)
raise NoDataError(msg)
return pl.DataFrame()


def _reorder_columns(
df: pl.DataFrame, columns: Sequence[int] | Sequence[str] | None
) -> pl.DataFrame:
@@ -788,6 +842,7 @@ def _read_spreadsheet_openpyxl(
) -> pl.DataFrame:
"""Use the 'openpyxl' library to read data from the given worksheet."""
infer_schema_length = read_options.pop("infer_schema_length", None)
has_header = read_options.pop("has_header", True)
no_inference = infer_schema_length == 0
ws = parser[sheet_name]

@@ -797,26 +852,37 @@ def _read_spreadsheet_openpyxl(
if tables := getattr(ws, "tables", None):
table = next(iter(tables.values()))
rows = list(ws[table.ref])
header.extend(cell.value for cell in rows.pop(0))
if not rows:
return _empty_frame(raise_if_empty)
if has_header:
header.extend(cell.value for cell in rows.pop(0))
else:
header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
if table.totalsRowCount:
rows = rows[: -table.totalsRowCount]
rows_iter = iter(rows)
rows_iter = rows
else:
rows_iter = ws.iter_rows()
for row in rows_iter:
row_values = [cell.value for cell in row]
if any(v is not None for v in row_values):
header.extend(row_values)
break
if not has_header:
if not (rows_iter := list(ws.iter_rows())):
return _empty_frame(raise_if_empty)
n_cols = len(rows_iter[0])
header = [f"column_{n}" for n in range(1, n_cols + 1)]
else:
rows_iter = ws.iter_rows()
for row in rows_iter:
row_values = [cell.value for cell in row]
if any(v is not None for v in row_values):
header.extend(row_values)
break

dtype = String if no_inference else None
series_data = []
for name, column_data in zip(header, zip(*rows_iter)):
if name:
values = [cell.value for cell in column_data]
if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment]
# note: if we init series with mixed-type data (eg: str/int)
# the non-strings will become null, so we handle the cast here
# note: if we initialise the series with mixed-type data (eg: str/int)
# then the non-strings will become null, so we handle the cast here
values = [str(v) if (v is not None) else v for v in values]

s = pl.Series(name, values, dtype=dtype, strict=False)
@@ -889,6 +955,10 @@ def _read_spreadsheet_calamine(
else:
ws_arrow = parser.load_sheet_eager(sheet_name, **read_options)
df = from_arrow(ws_arrow)
if read_options.get("header_row", False) is None and not read_options.get(
"column_names"
):
df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)]

# note: even if we applied parser dtypes we still re-apply schema_overrides
# natively as we can refine integer/float types, temporal precision, etc.
