forked from dask-contrib/dask-deltatable
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pass a dict instead of ParquetFileWriteOptions that can't be pickled.
- Loading branch information
Showing
3 changed files
with
112 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# flake8 doesn't support pyproject.toml yet https://github.com/PyCQA/flake8/issues/234
[flake8]
exclude = __init__.py
max-line-length = 120
ignore =
    # Extra space in brackets
    E20
    # Multiple spaces around ","
    E231,E241
    # Comments
    E26
    # Import formatting
    E4
    # Comparing types instead of isinstance
    E721
    # Assigning lambda expression
    E731
    # Ambiguous variable names
    E741
    # Line break before binary operator
    W503
    # Line break after binary operator
    W504
    # Redefinition of unused 'loop' from line 10
    F811
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from __future__ import annotations | ||
|
||
import pytest | ||
|
||
distributed = pytest.importorskip("distributed") | ||
|
||
import os # noqa: E402 | ||
import sys # noqa: E402 | ||
|
||
import pyarrow as pa # noqa: E402 | ||
import pyarrow.dataset as pa_ds # noqa: E402 | ||
import pyarrow.parquet as pq # noqa: E402 | ||
from dask.datasets import timeseries # noqa: E402 | ||
from distributed.utils_test import cleanup # noqa F401 | ||
from distributed.utils_test import ( # noqa F401 | ||
client, | ||
cluster, | ||
cluster_fixture, | ||
gen_cluster, | ||
loop, | ||
loop_in_thread, | ||
popen, | ||
varying, | ||
) | ||
|
||
import dask_deltatable as ddt # noqa: E402 | ||
|
||
# Skip every test in this module on Windows; the distributed test-cluster
# fixture currently fails during teardown on the Windows CI workers.
_WIN32_SKIP_REASON = (
    "The teardown of distributed.utils_test.cluster_fixture "
    "fails on windows CI currently"
)
pytestmark = pytest.mark.skipif(sys.platform == "win32", reason=_WIN32_SKIP_REASON)
|
||
|
||
def test_write(client, tmpdir):
    """Smoke test: write a small dask DataFrame to a fresh Delta table.

    Uses the ``client`` fixture so the write runs through a distributed
    scheduler rather than the default local one.
    """
    # Two days of hourly synthetic rows, one dask partition per day,
    # covering object, float, and int columns.
    frame = timeseries(
        start="2023-01-01",
        end="2023-01-03",
        freq="1H",
        partition_freq="1D",
        dtypes={"str": object, "float": float, "int": int},
    )
    ddt.to_deltalake(f"{tmpdir}", frame.reset_index())
|
||
|
||
def test_write_with_options(client, tmpdir):
    """Writer options passed as a plain dict must reach the parquet layer.

    Passing a dict (rather than a ``ParquetFileWriteOptions`` object, which
    cannot be pickled for the distributed workers) should still apply the
    requested compression to the written files.
    """
    frame = timeseries(
        start="2023-01-01",
        end="2023-01-03",
        freq="1H",
        partition_freq="1D",
        dtypes={"str": object, "float": float, "int": int},
    ).reset_index()
    ddt.to_deltalake(f"{tmpdir}", frame, file_options={"compression": "gzip"})
    # Pick the first parquet data file that was produced and verify that
    # its column chunks were actually gzip-compressed.
    written_name = [f for f in os.listdir(tmpdir) if f.endswith(".parquet")][0]
    metadata = pq.ParquetFile(f"{tmpdir}/{written_name}").metadata
    assert metadata.row_group(0).column(0).compression == "GZIP"
|
||
|
||
def test_write_with_schema(client, tmpdir):
    """An explicit pyarrow schema must be honoured by the Delta write.

    The frame's natural dtypes (float64/int64) are narrowed by the supplied
    schema (float32/int32); the dataset on disk must end up with exactly the
    requested schema.
    """
    frame = timeseries(
        start="2023-01-01",
        end="2023-01-03",
        freq="1H",
        partition_freq="1D",
        dtypes={"str": object, "float": float, "int": int},
    ).reset_index()
    fields = [
        pa.field("timestamp", pa.timestamp("us")),
        pa.field("str", pa.string()),
        pa.field("float", pa.float32()),
        pa.field("int", pa.int32()),
    ]
    target_schema = pa.schema(fields)
    ddt.to_deltalake(f"{tmpdir}", frame, schema=target_schema)
    # Re-open what was written and compare the on-disk schema exactly.
    written = pa_ds.dataset(str(tmpdir))
    assert written.schema == target_schema