feat(python): Expose top-level "has_header" param in read_excel across all supported engines
alexander-beedie committed Aug 13, 2024
1 parent fac700d commit c5d3b87
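
For context, a minimal usage sketch of the newly exposed parameter (file paths, sheet names and data are illustrative placeholders, not part of this commit):

    import polars as pl

    # default behaviour is unchanged: the first row is read as the header
    df = pl.read_excel("data.xlsx", sheet_name="Sheet1", engine="calamine")

    # with has_header=False the first row is read as data and column names
    # are autogenerated as column_1, column_2, ...
    df = pl.read_excel("data.xlsx", sheet_name="Sheet1", has_header=False)

    # the same top-level parameter is exposed on read_ods
    df = pl.read_ods("data.ods", has_header=False)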
Showing 2 changed files with 145 additions and 59 deletions.
178 changes: 124 additions & 54 deletions py-polars/polars/io/spreadsheet/functions.py
@@ -50,6 +50,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -65,6 +66,7 @@ def read_excel(
sheet_name: None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
has_header: bool = ...,
read_options: dict[str, Any] | None = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
@@ -82,6 +84,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -100,6 +103,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -116,6 +120,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -132,6 +137,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -149,6 +155,7 @@ def read_excel(
engine: ExcelSpreadsheetEngine = "calamine",
engine_options: dict[str, Any] | None = None,
read_options: dict[str, Any] | None = None,
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
@@ -207,6 +214,10 @@ def read_excel(
* "calamine": `ExcelReader.load_sheet_by_name`
* "xlsx2csv": `pl.read_csv`
* "openpyxl": n/a (can only provide `engine_options`)
has_header
Indicate if the first row of the table data is a header or not. If False,
column names will be autogenerated in the following format: `column_x`, with
`x` being an enumeration over every column in the dataset, starting at 1.
columns
Columns to read from the sheet; if not specified, all columns are read. Can
be given as a sequence of column names or indices.
@@ -285,6 +296,7 @@ def read_excel(
schema_overrides=schema_overrides,
infer_schema_length=infer_schema_length,
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
)

@@ -295,6 +307,7 @@ def read_ods(
*,
sheet_id: None = ...,
sheet_name: str,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -308,6 +321,7 @@ def read_ods(
*,
sheet_id: None = ...,
sheet_name: None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -321,6 +335,7 @@ def read_ods(
*,
sheet_id: int,
sheet_name: str,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -334,6 +349,7 @@ def read_ods(
*,
sheet_id: Literal[0] | Sequence[int],
sheet_name: None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -347,6 +363,7 @@ def read_ods(
*,
sheet_id: int,
sheet_name: None = ...,
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -360,6 +377,7 @@ def read_ods(
*,
sheet_id: None,
sheet_name: list[str] | tuple[str],
has_header: bool = ...,
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
@@ -372,6 +390,7 @@ def read_ods(
*,
sheet_id: int | Sequence[int] | None = None,
sheet_name: str | list[str] | tuple[str] | None = None,
has_header: bool = True,
columns: Sequence[int] | Sequence[str] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
@@ -396,6 +415,10 @@ def read_ods(
sheet_name
Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If
more than one is given then a `{sheetname:frame,}` dict is returned.
has_header
Indicate if the first row of the table data is a header or not. If False,
column names will be autogenerated in the following format: `column_x`, with
`x` being an enumeration over every column in the dataset, starting at 1.
columns
Columns to read from the sheet; if not specified, all columns are read. Can
be given as a sequence of column names or indices.
@@ -446,6 +469,7 @@ def read_ods(
schema_overrides=schema_overrides,
infer_schema_length=infer_schema_length,
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
)

@@ -495,52 +519,32 @@ def _identify_workbook(wb: str | Path | IO[bytes] | bytes) -> str | None:
def _read_spreadsheet(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
*,
source: str | Path | IO[bytes] | bytes,
engine: ExcelSpreadsheetEngine,
engine_options: dict[str, Any] | None = None,
read_options: dict[str, Any] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
columns: Sequence[int] | Sequence[str] | None = None,
*,
has_header: bool = True,
raise_if_empty: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
if looks_like_url(source):
source = process_file_url(source)

read_options = (read_options or {}).copy()
read_options = _get_read_options(
read_options,
engine=engine,
columns=columns,
has_header=has_header,
infer_schema_length=infer_schema_length,
)
engine_options = (engine_options or {}).copy()
schema_overrides = dict(schema_overrides or {})

# normalise some top-level parameters to 'read_options' entries
if engine == "calamine":
if ("use_columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["use_columns"]`'
raise ParameterCollisionError(msg)
elif ("schema_sample_rows" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`'
raise ParameterCollisionError(msg)

read_options["schema_sample_rows"] = infer_schema_length

elif engine == "xlsx2csv":
if ("columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["columns"]`'
raise ParameterCollisionError(msg)
elif ("infer_schema_length" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`'
raise ParameterCollisionError(msg)

read_options["infer_schema_length"] = infer_schema_length
else:
read_options["infer_schema_length"] = infer_schema_length

# establish the reading function, parser, and available worksheets
reader_fn, parser, worksheets = _initialise_spreadsheet_parser(
engine, source, engine_options
@@ -573,6 +577,59 @@ def _read_spreadsheet(
return next(iter(parsed_sheets.values()))


def _get_read_options(
read_options: dict[str, Any] | None,
*,
engine: ExcelSpreadsheetEngine,
columns: Sequence[int] | Sequence[str] | None,
infer_schema_length: int | None,
has_header: bool,
) -> dict[str, Any]:
"""Normalise top-level parameters to engine-specific 'read_options' dict."""
read_options = (read_options or {}).copy()
if engine == "calamine":
if ("use_columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["use_columns"]`'
raise ParameterCollisionError(msg)
elif read_options.get("header_row") is not None and has_header is False:
msg = 'the values of `has_header` and `read_options["header_row"]` are not compatible'
raise ParameterCollisionError(msg)
elif ("schema_sample_rows" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["schema_sample_rows"]`'
raise ParameterCollisionError(msg)

read_options["schema_sample_rows"] = infer_schema_length
if has_header is False and "header_row" not in read_options:
read_options["header_row"] = None

elif engine == "xlsx2csv":
if ("columns" in read_options) and columns:
msg = 'cannot specify both `columns` and `read_options["columns"]`'
raise ParameterCollisionError(msg)
elif (
"has_header" in read_options
and read_options["has_header"] is not has_header
):
msg = 'the values of `has_header` and `read_options["has_header"]` are not compatible'
raise ParameterCollisionError(msg)
elif ("infer_schema_length" in read_options) and (
infer_schema_length != N_INFER_DEFAULT
):
msg = 'cannot specify both `infer_schema_length` and `read_options["infer_schema_length"]`'
raise ParameterCollisionError(msg)

read_options["infer_schema_length"] = infer_schema_length
if "has_header" not in read_options:
read_options["has_header"] = has_header
else:
read_options["infer_schema_length"] = infer_schema_length
read_options["has_header"] = has_header

return read_options
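
For illustration (not part of the diff; the file path is a placeholder), the checks above mean that a conflicting engine-specific option passed via `read_options` now raises ParameterCollisionError, while the top-level `has_header` is otherwise normalised into the engine's own option:

    import polars as pl

    # conflicting: has_header=False together with an explicit calamine
    # "header_row" -> ParameterCollisionError
    pl.read_excel(
        "data.xlsx",
        engine="calamine",
        has_header=False,
        read_options={"header_row": 2},
    )

    # non-conflicting: has_header=False is translated to header_row=None
    # for calamine (and to has_header=False for xlsx2csv)
    pl.read_excel("data.xlsx", engine="calamine", has_header=False)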


def _get_sheet_names(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
@@ -695,13 +752,7 @@ def _csv_buffer_to_frame(
"""Translate StringIO buffer containing delimited data as a DataFrame."""
# handle (completely) empty sheet data
if csv.tell() == 0:
if raise_if_empty:
msg = (
"empty Excel sheet"
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
)
raise NoDataError(msg)
return pl.DataFrame()
return _empty_frame(raise_if_empty)

if read_options is None:
read_options = {}
@@ -754,18 +805,21 @@ def _drop_null_data(df: pl.DataFrame, *, raise_if_empty: bool) -> pl.DataFrame:
df = df.drop(*null_cols)

if len(df) == 0 and len(df.columns) == 0:
if not raise_if_empty:
return df
else:
msg = (
"empty Excel sheet"
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
)
raise NoDataError(msg)
return _empty_frame(raise_if_empty)

return df.filter(~F.all_horizontal(F.all().is_null()))


def _empty_frame(raise_if_empty: bool) -> pl.DataFrame: # noqa: FBT001
if raise_if_empty:
msg = (
"empty Excel sheet"
"\n\nIf you want to read this as an empty DataFrame, set `raise_if_empty=False`."
)
raise NoDataError(msg)
return pl.DataFrame()


def _reorder_columns(
df: pl.DataFrame, columns: Sequence[int] | Sequence[str] | None
) -> pl.DataFrame:
@@ -788,6 +842,7 @@ def _read_spreadsheet_openpyxl(
) -> pl.DataFrame:
"""Use the 'openpyxl' library to read data from the given worksheet."""
infer_schema_length = read_options.pop("infer_schema_length", None)
has_header = read_options.pop("has_header", True)
no_inference = infer_schema_length == 0
ws = parser[sheet_name]

@@ -797,26 +852,37 @@ def _read_spreadsheet_openpyxl(
if tables := getattr(ws, "tables", None):
table = next(iter(tables.values()))
rows = list(ws[table.ref])
header.extend(cell.value for cell in rows.pop(0))
if not rows:
return _empty_frame(raise_if_empty)
if has_header:
header.extend(cell.value for cell in rows.pop(0))
else:
header.extend(f"column_{n}" for n in range(1, len(rows[0]) + 1))
if table.totalsRowCount:
rows = rows[: -table.totalsRowCount]
rows_iter = iter(rows)
rows_iter = rows
else:
rows_iter = ws.iter_rows()
for row in rows_iter:
row_values = [cell.value for cell in row]
if any(v is not None for v in row_values):
header.extend(row_values)
break
if not has_header:
if not (rows_iter := list(ws.iter_rows())):
return _empty_frame(raise_if_empty)
n_cols = len(rows_iter[0])
header = [f"column_{n}" for n in range(1, n_cols + 1)]
else:
rows_iter = ws.iter_rows()
for row in rows_iter:
row_values = [cell.value for cell in row]
if any(v is not None for v in row_values):
header.extend(row_values)
break

dtype = String if no_inference else None
series_data = []
for name, column_data in zip(header, zip(*rows_iter)):
if name:
values = [cell.value for cell in column_data]
if no_inference or (dtype := (schema_overrides or {}).get(name)) == String: # type: ignore[assignment]
# note: if we init series with mixed-type data (eg: str/int)
# the non-strings will become null, so we handle the cast here
# note: if we initialise the series with mixed-type data (eg: str/int)
# then the non-strings will become null, so we handle the cast here
values = [str(v) if (v is not None) else v for v in values]

s = pl.Series(name, values, dtype=dtype, strict=False)
@@ -889,6 +955,10 @@ def _read_spreadsheet_calamine(
else:
ws_arrow = parser.load_sheet_eager(sheet_name, **read_options)
df = from_arrow(ws_arrow)
if read_options.get("header_row", False) is None and not read_options.get(
"column_names"
):
df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)]

# note: even if we applied parser dtypes we still re-apply schema_overrides
# natively as we can refine integer/float types, temporal precision, etc.
