From 8ece165a2d960592d724ccaaf7bdf67a325a6300 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Tue, 10 Oct 2023 16:25:43 +0200 Subject: [PATCH] docs(python): Improved user guide for cloud functionality (#11646) --- CONTRIBUTING.md | 61 +++++++++++++++++- docs/_build/API_REFERENCE_LINKS.yml | 13 +++- docs/src/python/user-guide/io/aws.py | 14 ----- .../src/python/user-guide/io/cloud-storage.py | 63 +++++++++++++++++++ docs/src/python/user-guide/io/database.py | 32 +++++++--- docs/src/python/user-guide/io/json.py | 24 +++++++ .../lazy/{query_plan.py => query-plan.py} | 0 .../sql/{sql_select.py => select.py} | 0 .../io/{aws.rs => cloud-storage.rs} | 17 ++++- .../user-guide/io/{json-file.rs => json.rs} | 0 docs/user-guide/io/aws.md | 20 ------ docs/user-guide/io/cloud-storage.md | 51 +++++++++++++++ docs/user-guide/io/database.md | 29 ++++++--- docs/user-guide/io/{json_file.md => json.md} | 14 +++-- docs/user-guide/io/parquet.md | 15 ++--- .../lazy/{query_plan.md => query-plan.md} | 30 ++++----- docs/user-guide/sql/select.md | 26 ++++---- mkdocs.yml | 6 +- 18 files changed, 313 insertions(+), 102 deletions(-) delete mode 100644 docs/src/python/user-guide/io/aws.py create mode 100644 docs/src/python/user-guide/io/cloud-storage.py create mode 100644 docs/src/python/user-guide/io/json.py rename docs/src/python/user-guide/lazy/{query_plan.py => query-plan.py} (100%) rename docs/src/python/user-guide/sql/{sql_select.py => select.py} (100%) rename docs/src/rust/user-guide/io/{aws.rs => cloud-storage.rs} (67%) rename docs/src/rust/user-guide/io/{json-file.rs => json.rs} (100%) delete mode 100644 docs/user-guide/io/aws.md create mode 100644 docs/user-guide/io/cloud-storage.md rename docs/user-guide/io/{json_file.md => json.md} (50%) rename docs/user-guide/lazy/{query_plan.md => query-plan.md} (73%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 17cd40831cc1..44321d2f35bb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -151,8 +151,65 @@ The most important components of Polars documentation are the [user guide](https ### User guide -The user guide is maintained in the `docs` folder. -Further contributing information will be added shortly. +The user guide is maintained in the `docs/user-guide` folder. Before creating a PR first raise an issue to discuss what you feel is missing or could be improved. + +#### Building and serving the user guide + +The user guide is built using [MkDocs](https://www.mkdocs.org/). You install the dependencies for building the user guide by running `make requirements` in the root of the repo. + +Run `mkdocs serve` to build and serve the user guide so you can view it locally and see updates as you make changes. + +#### Creating a new user guide page + +Each user guide page is based on a `.md` markdown file. This file must be listed in `mkdocs.yml`. + +#### Adding a shell code block + +To add a code block with code to be run in a shell with tabs for Python and Rust, use the following format: + +```` +=== ":fontawesome-brands-python: Python" + + ```shell + $ pip install fsspec + ``` + +=== ":fontawesome-brands-rust: Rust" + + ```shell + $ cargo add aws_sdk_s3 + ``` +```` + +#### Adding a code block + +The snippets for Python and Rust code blocks are in the `docs/src/python/` and `docs/src/rust/` directories, respectively. 
To add a code snippet with Python or Rust code to a `.md` page, use the following format: + +``` +{{code_block('user-guide/io/cloud-storage','read_parquet',[read_parquet,read_csv])}} +``` + +- The first argument is a path to either or both files called `docs/src/python/user-guide/io/cloud-storage.py` and `docs/src/rust/user-guide/io/cloud-storage.rs`. +- The second argument is the name given at the start and end of each snippet in the `.py` or `.rs` file +- The third argument is a list of links to functions in the API docs. For each element of the list there must be a corresponding entry in `docs/_build/API_REFERENCE_LINKS.yml` + +If the corresponding `.py` and `.rs` snippet files both exist then each snippet named in the second argument to `code_block` above must exist or the build will fail. An empty snippet should be added to the `.py` or `.rs` file if the snippet is not needed. + +Each snippet is formatted as follows: + +```python +# --8<-- [start:read_parquet] +import polars as pl + +df = pl.read_parquet("file.parquet") +# --8<-- [end:read_parquet] +``` + +The snippet is delimited by `--8<-- [start:]` and `--8<-- [end:]`. The snippet name must match the name given in the second argument to `code_block` above. + +#### Linting + +Before committing, install `dprint` (see above) and run `dprint fmt` from the `docs` directory to lint the markdown files. ### API reference diff --git a/docs/_build/API_REFERENCE_LINKS.yml b/docs/_build/API_REFERENCE_LINKS.yml index bad064287873..73b437cc9324 100644 --- a/docs/_build/API_REFERENCE_LINKS.yml +++ b/docs/_build/API_REFERENCE_LINKS.yml @@ -12,8 +12,7 @@ python: write_csv: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_csv.html read_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_json.html write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html - read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html - write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html + read_ipc: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_ipc.html min: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.min.html max: https://pola-rs.github.io/polars/py-polars/html/reference/series/api/polars.Series.max.html value_counts: https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.value_counts.html @@ -65,6 +64,7 @@ python: write_database: name: write_database link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_database.html + read_database_uri: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_database_uri.html read_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.read_parquet.html write_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_parquet.html scan_parquet: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_parquet.html @@ -73,6 +73,7 @@ python: write_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_ndjson.html write_json: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.DataFrame.write_json.html scan_ndjson: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_ndjson.html + scan_pyarrow_dataset: 
https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_pyarrow_dataset.html
   from_arrow:
     name: from_arrow
     link: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.from_arrow.html
@@ -197,7 +198,7 @@ rust:
     feature_flags: ['json']
   read_ndjson:
     name: JsonLineReader
-    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/ndjson_core/ndjson/struct.JsonLineReader.html
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/ndjson/core/struct.JsonLineReader.html
     feature_flags: ['json']
   write_json:
     name: JsonWriter
@@ -223,6 +224,12 @@ rust:
     name: scan_parquet
     link: https://pola-rs.github.io/polars/docs/rust/dev/polars/prelude/struct.LazyFrame.html#method.scan_parquet
     feature_flags: ['parquet']
+  read_ipc:
+    name: IpcReader
+    link: https://pola-rs.github.io/polars/docs/rust/dev/polars_io/prelude/struct.IpcReader.html
+    feature_flags: ['ipc']
+  scan_pyarrow_dataset: https://pola-rs.github.io/polars/py-polars/html/reference/api/polars.scan_pyarrow_dataset.html
+
   min: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html#method.min
   max: https://pola-rs.github.io/polars/docs/rust/dev/polars/series/struct.Series.html#method.max
   struct:
diff --git a/docs/src/python/user-guide/io/aws.py b/docs/src/python/user-guide/io/aws.py
deleted file mode 100644
index c8bfa94941d2..000000000000
--- a/docs/src/python/user-guide/io/aws.py
+++ /dev/null
@@ -1,14 +0,0 @@
-"""
-# --8<-- [start:bucket]
-import polars as pl
-import pyarrow.parquet as pq
-import s3fs
-
-fs = s3fs.S3FileSystem()
-bucket = ""
-path = ""
-
-dataset = pq.ParquetDataset(f"s3://{bucket}/{path}", filesystem=fs)
-df = pl.from_arrow(dataset.read())
-# --8<-- [end:bucket]
-"""
diff --git a/docs/src/python/user-guide/io/cloud-storage.py b/docs/src/python/user-guide/io/cloud-storage.py
new file mode 100644
index 000000000000..0f968e15f97b
--- /dev/null
+++ b/docs/src/python/user-guide/io/cloud-storage.py
@@ -0,0 +1,63 @@
+"""
+# --8<-- [start:read_parquet]
+import polars as pl
+
+source = "s3://bucket/*.parquet"
+
+df = pl.read_parquet(source)
+# --8<-- [end:read_parquet]
+
+# --8<-- [start:scan_parquet]
+import polars as pl
+
+source = "s3://bucket/*.parquet"
+
+storage_options = {
+    "aws_access_key_id": "",
+    "aws_secret_access_key": "",
+    "aws_region": "us-east-1",
+}
+df = pl.scan_parquet(source, storage_options=storage_options)
+# --8<-- [end:scan_parquet]
+
+# --8<-- [start:scan_parquet_query]
+import polars as pl
+
+source = "s3://bucket/*.parquet"
+
+
+df = pl.scan_parquet(source).filter(pl.col("id") < 100).select("id", "value").collect()
+# --8<-- [end:scan_parquet_query]
+
+# --8<-- [start:scan_pyarrow_dataset]
+import polars as pl
+import pyarrow.dataset as ds
+
+dset = ds.dataset("s3://my-partitioned-folder/", format="parquet")
+(
+    pl.scan_pyarrow_dataset(dset)
+    .filter(pl.col("foo") == "a")
+    .select(["foo", "bar"])
+    .collect()
+)
+# --8<-- [end:scan_pyarrow_dataset]
+
+# --8<-- [start:write_parquet]
+
+import polars as pl
+import s3fs
+
+df = pl.DataFrame({
+    "foo": ["a", "b", "c", "d", "d"],
+    "bar": [1, 2, 3, 4, 5],
+})
+
+fs = s3fs.S3FileSystem()
+destination = "s3://bucket/my_file.parquet"
+
+# write parquet
+with fs.open(destination, mode="wb") as f:
+    df.write_parquet(f)
+# --8<-- [end:write_parquet]
+
+"""
diff --git a/docs/src/python/user-guide/io/database.py b/docs/src/python/user-guide/io/database.py
index 97e8f659de73..b37045719995 100644
--- a/docs/src/python/user-guide/io/database.py
+++ b/docs/src/python/user-guide/io/database.py
@@ -1,32 +1,44 @@
 """
-# --8<-- [start:read] +# --8<-- [start:read_uri] import polars as pl -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" query = "SELECT * FROM foo" -pl.read_database(query=query, connection_uri=connection_uri) -# --8<-- [end:read] +pl.read_database_uri(query=query, uri=uri) +# --8<-- [end:read_uri] + +# --8<-- [start:read_cursor] +import polars as pl +from sqlalchemy import create_engine + +conn = create_engine(f"sqlite:///test.db") + +query = "SELECT * FROM foo" + +pl.read_database(query=query, connection=conn.connect()) +# --8<-- [end:read_cursor] + # --8<-- [start:adbc] -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" query = "SELECT * FROM foo" -pl.read_database(query=query, connection_uri=connection_uri, engine="adbc") +pl.read_database_uri(query=query, uri=uri, engine="adbc") # --8<-- [end:adbc] # --8<-- [start:write] -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" df = pl.DataFrame({"foo": [1, 2, 3]}) -df.write_database(table_name="records", connection_uri=connection_uri) +df.write_database(table_name="records", uri=uri) # --8<-- [end:write] # --8<-- [start:write_adbc] -connection_uri = "postgres://username:password@server:port/database" +uri = "postgres://username:password@server:port/database" df = pl.DataFrame({"foo": [1, 2, 3]}) -df.write_database(table_name="records", connection_uri=connection_uri, engine="adbc") +df.write_database(table_name="records", uri=uri, engine="adbc") # --8<-- [end:write_adbc] """ diff --git a/docs/src/python/user-guide/io/json.py b/docs/src/python/user-guide/io/json.py new file mode 100644 index 000000000000..8e6ba3955dc4 --- /dev/null +++ b/docs/src/python/user-guide/io/json.py @@ -0,0 +1,24 @@ +# --8<-- [start:setup] +import polars as pl + +# --8<-- [end:setup] + +""" +# --8<-- [start:read] +df = pl.read_json("docs/data/path.json") +# --8<-- [end:read] + +# --8<-- [start:readnd] +df = pl.read_ndjson("docs/data/path.json") +# --8<-- [end:readnd] + +""" + +# --8<-- [start:write] +df = pl.DataFrame({"foo": [1, 2, 3], "bar": [None, "bak", "baz"]}) +df.write_json("docs/data/path.json") +# --8<-- [end:write] + +# --8<-- [start:scan] +df = pl.scan_ndjson("docs/data/path.json") +# --8<-- [end:scan] diff --git a/docs/src/python/user-guide/lazy/query_plan.py b/docs/src/python/user-guide/lazy/query-plan.py similarity index 100% rename from docs/src/python/user-guide/lazy/query_plan.py rename to docs/src/python/user-guide/lazy/query-plan.py diff --git a/docs/src/python/user-guide/sql/sql_select.py b/docs/src/python/user-guide/sql/select.py similarity index 100% rename from docs/src/python/user-guide/sql/sql_select.py rename to docs/src/python/user-guide/sql/select.py diff --git a/docs/src/rust/user-guide/io/aws.rs b/docs/src/rust/user-guide/io/cloud-storage.rs similarity index 67% rename from docs/src/rust/user-guide/io/aws.rs rename to docs/src/rust/user-guide/io/cloud-storage.rs index 0a1924d9d294..4118e520628d 100644 --- a/docs/src/rust/user-guide/io/aws.rs +++ b/docs/src/rust/user-guide/io/cloud-storage.rs @@ -1,5 +1,5 @@ """ -# --8<-- [start:bucket] +# --8<-- [start:read_parquet] use aws_sdk_s3::Region; use aws_config::meta::region::RegionProviderChain; @@ -28,5 +28,18 @@ async fn main() { println!("{:?}", df); } -# --8<-- [end:bucket] +# --8<-- [end:read_parquet] + +# --8<-- [start:scan_parquet] +# --8<-- 
[end:scan_parquet]
+
+# --8<-- [start:scan_parquet_query]
+# --8<-- [end:scan_parquet_query]
+
+# --8<-- [start:scan_pyarrow_dataset]
+# --8<-- [end:scan_pyarrow_dataset]
+
+# --8<-- [start:write_parquet]
+# --8<-- [end:write_parquet]
+
 """
diff --git a/docs/src/rust/user-guide/io/json-file.rs b/docs/src/rust/user-guide/io/json.rs
similarity index 100%
rename from docs/src/rust/user-guide/io/json-file.rs
rename to docs/src/rust/user-guide/io/json.rs
diff --git a/docs/user-guide/io/aws.md b/docs/user-guide/io/aws.md
deleted file mode 100644
index 27c9cfeaf453..000000000000
--- a/docs/user-guide/io/aws.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# AWS
-
---8<-- "docs/_build/snippets/under_construction.md"
-
-To read from or write to an AWS bucket, additional dependencies are needed in Rust:
-
-=== ":fontawesome-brands-rust: Rust"
-
-```shell
-$ cargo add aws_sdk_s3 aws_config tokio --features tokio/full
-```
-
-In the next few snippets we'll demonstrate interacting with a `Parquet` file
-located on an AWS bucket.
-
-## Read
-
-Load a `.parquet` file using:
-
-{{code_block('user-guide/io/aws','bucket',[])}}
diff --git a/docs/user-guide/io/cloud-storage.md b/docs/user-guide/io/cloud-storage.md
new file mode 100644
index 000000000000..a10226a99e65
--- /dev/null
+++ b/docs/user-guide/io/cloud-storage.md
@@ -0,0 +1,51 @@
+# Cloud storage
+
+Polars can read from and write to AWS S3, Azure Blob Storage and Google Cloud Storage. The API is the same for all three storage providers.
+
+To read from cloud storage, additional dependencies may be needed depending on the use case and cloud storage provider:
+
+=== ":fontawesome-brands-python: Python"
+
+    ```shell
+    $ pip install fsspec s3fs adlfs gcsfs
+    ```
+
+=== ":fontawesome-brands-rust: Rust"
+
+    ```shell
+    $ cargo add aws_sdk_s3 aws_config tokio --features tokio/full
+    ```
+
+## Reading from cloud storage
+
+Polars can read a CSV, IPC or Parquet file in eager mode from cloud storage.
+
+{{code_block('user-guide/io/cloud-storage','read_parquet',['read_parquet','read_csv','read_ipc'])}}
+
+This eager query downloads the file to a buffer in memory and creates a `DataFrame` from there. Polars uses `fsspec` to manage this download internally for all cloud storage providers.
+
+## Scanning from cloud storage with query optimisation
+
+Polars can scan a Parquet file in lazy mode from cloud storage. We may need to provide further details beyond the source URL, such as authentication details or storage region. Polars looks for these as environment variables, but we can also do this manually by passing a `dict` as the `storage_options` argument.
+
+{{code_block('user-guide/io/cloud-storage','scan_parquet',['scan_parquet'])}}
+
+This query creates a `LazyFrame` without downloading the file. In the `LazyFrame` we have access to file metadata such as the schema. Polars uses the `object_store.rs` library internally to manage the interface with the cloud storage providers and so no extra dependencies are required in Python to scan a cloud Parquet file.
+
+If we create a lazy query with [predicate and projection pushdowns](../lazy/optimizations.md), the query optimiser will apply them before the file is downloaded. This can significantly reduce the amount of data that needs to be downloaded. The query evaluation is triggered by calling `collect`.
+
+{{code_block('user-guide/io/cloud-storage','scan_parquet_query',[])}}
+
+## Scanning with PyArrow
+
+We can also scan from cloud storage using PyArrow. This is particularly useful for partitioned datasets such as Hive partitioning.
+ +We first create a PyArrow dataset and then create a `LazyFrame` from the dataset. + +{{code_block('user-guide/io/cloud-storage','scan_pyarrow_dataset',['scan_pyarrow_dataset'])}} + +## Writing to cloud storage + +We can write a `DataFrame` to cloud storage in Python using s3fs for S3, adlfs for Azure Blob Storage and gcsfs for Google Cloud Storage. In this example, we write a Parquet file to S3. + +{{code_block('user-guide/io/cloud-storage','write_parquet',['write_parquet'])}} diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md index 4444e7be799e..f83706e5e79a 100644 --- a/docs/user-guide/io/database.md +++ b/docs/user-guide/io/database.md @@ -2,19 +2,31 @@ ## Read from a database -We can read from a database with Polars using the `pl.read_database` function. To use this function you need an SQL query string and a connection string called a `connection_uri`. +Polars can read from a database using the `pl.read_database_uri` and `pl.read_database` functions. -For example, the following snippet shows the general patterns for reading all columns from the `foo` table in a Postgres database: +### Difference between `read_database_uri` and `read_database` -{{code_block('user-guide/io/database','read',['read_database_connectorx'])}} +Use `pl.read_database_uri` if you want to specify the database connection with a connection string called a `uri`. For example, the following snippet shows a query to read all columns from the `foo` table in a Postgres database where we use the `uri` to connect: + +{{code_block('user-guide/io/database','read_uri',['read_database_uri'])}} + +On the other hand, use `pl.read_database` if you want to connect via a connection engine created with a library like SQLAlchemy. + +{{code_block('user-guide/io/database','read_cursor',['read_database'])}} + +Note that `pl.read_database_uri` is likely to be faster than `pl.read_database` if you are using a SQLAlchemy or DBAPI2 connection as these connections may load the data row-wise into Python before copying the data again to the column-wise Apache Arrow format. ### Engines -Polars doesn't manage connections and data transfer from databases by itself. Instead external libraries (known as _engines_) handle this. At present Polars can use two engines to read from databases: +Polars doesn't manage connections and data transfer from databases by itself. Instead, external libraries (known as _engines_) handle this. + +When using `pl.read_database`, you specify the engine when you create the connection object. When using `pl.read_database_uri`, you can specify one of two engines to read from the database: - [ConnectorX](https://github.com/sfu-db/connector-x) and - [ADBC](https://arrow.apache.org/docs/format/ADBC.html) +Both engines have native support for Apache Arrow and so can read data directly into a Polars `DataFrame` without copying the data. + #### ConnectorX ConnectorX is the default engine and [supports numerous databases](https://github.com/sfu-db/connector-x#sources) including Postgres, Mysql, SQL Server and Redshift. ConnectorX is written in Rust and stores data in Arrow format to allow for zero-copy to Polars. 
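
For reference, a minimal sketch of a read through the default ConnectorX engine is shown below; the connection string is a placeholder and passing `engine="connectorx"` simply makes the default explicit.

```python
import polars as pl

# Placeholder URI; substitute real credentials, host and database
uri = "postgres://username:password@server:port/database"
query = "SELECT * FROM foo"

# ConnectorX is the default, so this is equivalent to omitting `engine`
df = pl.read_database_uri(query=query, uri=uri, engine="connectorx")
```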
@@ -35,9 +47,9 @@ It is still early days for ADBC so support for different databases is still limi
 $ pip install adbc-driver-sqlite
 ```
 
-As ADBC is not the default engine you must specify the engine as an argument to `pl.read_database`
+As ADBC is not the default engine you must specify the engine as an argument to `pl.read_database_uri`.
 
-{{code_block('user-guide/io/database','adbc',['read_database'])}}
+{{code_block('user-guide/io/database','adbc',['read_database_uri'])}}
 
 ## Write to a database
 
@@ -62,9 +74,10 @@ In this example, we write the `DataFrame` to a table called `records` in the dat
 
 {{code_block('user-guide/io/database','write',['write_database'])}}
 
-In the SQLAlchemy approach Polars converts the `DataFrame` to a Pandas `DataFrame` backed by PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database.
+In the SQLAlchemy approach, Polars converts the `DataFrame` to a Pandas `DataFrame` backed by PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database.
 
 #### ADBC
 
-As with reading from a database you can also use ADBC to write to a SQLite or Posgres database. As shown above you need to install the appropriate ADBC driver for your database.
+As with reading from a database, you can also use ADBC to write to a SQLite or Postgres database. As shown above, you need to install the appropriate ADBC driver for your database.
+
 {{code_block('user-guide/io/database','write_adbc',['write_database'])}}
diff --git a/docs/user-guide/io/json_file.md b/docs/user-guide/io/json.md
similarity index 50%
rename from docs/user-guide/io/json_file.md
rename to docs/user-guide/io/json.md
index 352904829c7b..c203d278ee87 100644
--- a/docs/user-guide/io/json_file.md
+++ b/docs/user-guide/io/json.md
@@ -1,26 +1,30 @@
 # JSON files
 
-## Read & write
+Polars can read and write both standard JSON and newline-delimited JSON (NDJSON).
+
+## Read
 
 ### JSON
 
 Reading a JSON file should look familiar:
 
-{{code_block('user-guide/io/json-file','read',['read_json'])}}
+{{code_block('user-guide/io/json','read',['read_json'])}}
 
 ### Newline Delimited JSON
 
 JSON objects that are delimited by newlines can be read into polars in a much more performant way than standard json.
 
-{{code_block('user-guide/io/json-file','readnd',['read_ndjson'])}}
+Polars can read an NDJSON file into a `DataFrame` using the `read_ndjson` function:
+
+{{code_block('user-guide/io/json','readnd',['read_ndjson'])}}
 
 ## Write
 
-{{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}}
+{{code_block('user-guide/io/json','write',['write_json','write_ndjson'])}}
 
 ## Scan
 
 `Polars` allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`.
 
-{{code_block('user-guide/io/json-file','scan',['scan_ndjson'])}}
+{{code_block('user-guide/io/json','scan',['scan_ndjson'])}}
diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md
index 71a5399bb393..c08efc2e1b9b 100644
--- a/docs/user-guide/io/parquet.md
+++ b/docs/user-guide/io/parquet.md
@@ -1,13 +1,13 @@
 # Parquet
 
-Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast.
-`Pandas` uses [`PyArrow`](https://arrow.apache.org/docs/python/) -`Python` bindings
-exposed by `Arrow`- to load `Parquet` files into memory, but it has to copy that data into
-`Pandas` memory. 
With `Polars` there is no extra cost due to -copying as we read `Parquet` directly into `Arrow` memory and _keep it there_. +Loading or writing [`Parquet` files](https://parquet.apache.org/) is lightning fast as the layout of data in a Polars `DataFrame` in memory mirrors the layout of a Parquet file on disk in many respects. + +Unlike CSV, Parquet is a columnar format. This means that the data is stored in columns rather than rows. This is a more efficient way of storing data as it allows for better compression and faster access to data. ## Read +We can read a `Parquet` file into a `DataFrame` using the `read_parquet` function: + {{code_block('user-guide/io/parquet','read',['read_parquet'])}} ## Write @@ -16,9 +16,10 @@ copying as we read `Parquet` directly into `Arrow` memory and _keep it there_. ## Scan -`Polars` allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the -file and instead returns a lazy computation holder called a `LazyFrame`. +`Polars` allows you to _scan_ a `Parquet` input. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. {{code_block('user-guide/io/parquet','scan',['scan_parquet'])}} If you want to know why this is desirable, you can read more about those `Polars` optimizations [here](../concepts/lazy-vs-eager.md). + +When we scan a `Parquet` file stored in the cloud, we can also apply predicate and projection pushdowns. This can significantly reduce the amount of data that needs to be downloaded. For scanning a Parquet file in the cloud, see [Cloud storage](cloud-storage.md/#scanning-from-cloud-storage-with-query-optimisation). diff --git a/docs/user-guide/lazy/query_plan.md b/docs/user-guide/lazy/query-plan.md similarity index 73% rename from docs/user-guide/lazy/query_plan.md rename to docs/user-guide/lazy/query-plan.md index bb57a74168de..c48a3f8a099c 100644 --- a/docs/user-guide/lazy/query_plan.md +++ b/docs/user-guide/lazy/query-plan.md @@ -8,17 +8,17 @@ For any lazy query `Polars` has both: We can understand both the non-optimized and optimized query plans with visualization and by printing them as text.
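
As a quick illustration of the two plans, here is a minimal, self-contained sketch with made-up data and arbitrary column names that prints both plans as text:

```python
import polars as pl

# A small lazy query over in-memory data
lf = (
    pl.LazyFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]})
    .filter(pl.col("id") < 3)
    .select("id", "value")
)

# The non-optimized plan: the query exactly as written
print(lf.explain(optimized=False))

# The optimized plan: what Polars will actually run
print(lf.explain())
```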
-```python exec="on" result="text" session="user-guide/lazy/query_plan" ---8<-- "python/user-guide/lazy/query_plan.py:setup" +```python exec="on" result="text" session="user-guide/lazy/query-plan" +--8<-- "python/user-guide/lazy/query-plan.py:setup" ```
Below we consider the following query: -{{code_block('user-guide/lazy/query_plan','plan',[])}} +{{code_block('user-guide/lazy/query-plan','plan',[])}} -```python exec="on" session="user-guide/lazy/query_plan" ---8<-- "python/user-guide/lazy/query_plan.py:plan" +```python exec="on" session="user-guide/lazy/query-plan" +--8<-- "python/user-guide/lazy/query-plan.py:plan" ``` ## Non-optimized query plan @@ -27,10 +27,10 @@ Below we consider the following query: First we visualise the non-optimized plan by setting `optimized=False`. -{{code_block('user-guide/lazy/query_plan','showplan',['show_graph'])}} +{{code_block('user-guide/lazy/query-plan','showplan',['show_graph'])}} -```python exec="on" session="user-guide/lazy/query_plan" ---8<-- "python/user-guide/lazy/query_plan.py:createplan" +```python exec="on" session="user-guide/lazy/query-plan" +--8<-- "python/user-guide/lazy/query-plan.py:createplan" ``` The query plan visualization should be read from bottom to top. In the visualization: @@ -43,10 +43,10 @@ The query plan visualization should be read from bottom to top. In the visualiza We can also print the non-optimized plan with `explain(optimized=False)` -{{code_block('user-guide/lazy/query_plan','describe',['explain'])}} +{{code_block('user-guide/lazy/query-plan','describe',['explain'])}} -```python exec="on" session="user-guide/lazy/query_plan" ---8<-- "python/user-guide/lazy/query_plan.py:describe" +```python exec="on" session="user-guide/lazy/query-plan" +--8<-- "python/user-guide/lazy/query-plan.py:describe" ``` ```text @@ -68,15 +68,15 @@ The printed plan should also be read from bottom to top. This non-optimized plan Now we visualize the optimized plan with `show_graph`. -{{code_block('user-guide/lazy/query_plan','show',['show_graph'])}} +{{code_block('user-guide/lazy/query-plan','show',['show_graph'])}} -```python exec="on" session="user-guide/lazy/query_plan" ---8<-- "python/user-guide/lazy/query_plan.py:createplan2" +```python exec="on" session="user-guide/lazy/query-plan" +--8<-- "python/user-guide/lazy/query-plan.py:createplan2" ``` We can also print the optimized plan with `explain` -{{code_block('user-guide/lazy/query_plan','optimized',['explain'])}} +{{code_block('user-guide/lazy/query-plan','optimized',['explain'])}} ```text WITH_COLUMNS: diff --git a/docs/user-guide/sql/select.md b/docs/user-guide/sql/select.md index 1c643895dec7..d994191c5068 100644 --- a/docs/user-guide/sql/select.md +++ b/docs/user-guide/sql/select.md @@ -9,39 +9,39 @@ FROM table_name; Here, `column1`, `column2`, etc. are the columns that you want to select from the table. You can also use the wildcard `*` to select all columns. `table_name` is the name of the table or that you want to retrieve data from. In the sections below we will cover some of the more common SELECT variants -{{code_block('user-guide/sql/sql_select','df',['SQLregister','SQLexecute'])}} +{{code_block('user-guide/sql/select','df',['SQLregister','SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" ---8<-- "python/user-guide/sql/sql_select.py:setup" ---8<-- "python/user-guide/sql/sql_select.py:df" +--8<-- "python/user-guide/sql/select.py:setup" +--8<-- "python/user-guide/sql/select.py:df" ``` ### GROUP BY The `GROUP BY` statement is used to group rows in a table by one or more columns and compute aggregate functions on each group. 
-{{code_block('user-guide/sql/sql_select','group_by',['SQLexecute'])}} +{{code_block('user-guide/sql/select','group_by',['SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" ---8<-- "python/user-guide/sql/sql_select.py:group_by" +--8<-- "python/user-guide/sql/select.py:group_by" ``` ### ORDER BY The `ORDER BY` statement is used to sort the result set of a query by one or more columns in ascending or descending order. -{{code_block('user-guide/sql/sql_select','orderby',['SQLexecute'])}} +{{code_block('user-guide/sql/select','orderby',['SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" ---8<-- "python/user-guide/sql/sql_select.py:orderby" +--8<-- "python/user-guide/sql/select.py:orderby" ``` ### JOIN -{{code_block('user-guide/sql/sql_select','join',['SQLregister_many','SQLexecute'])}} +{{code_block('user-guide/sql/select','join',['SQLregister_many','SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" ---8<-- "python/user-guide/sql/sql_select.py:join" +--8<-- "python/user-guide/sql/select.py:join" ``` ### Functions @@ -55,18 +55,18 @@ Polars provides a wide range of SQL functions, including: For a full list of supported functions go the [API documentation](https://docs.rs/polars-sql/latest/src/polars_sql/keywords.rs.html). The example below demonstrates how to use a function in a query -{{code_block('user-guide/sql/sql_select','functions',['SQLquery'])}} +{{code_block('user-guide/sql/select','functions',['SQLquery'])}} ```python exec="on" result="text" session="user-guide/sql/select" ---8<-- "python/user-guide/sql/sql_select.py:functions" +--8<-- "python/user-guide/sql/select.py:functions" ``` ### Table Functions In the examples earlier we first generated a DataFrame which we registered in the `SQLContext`. Polars also support directly reading from CSV, Parquet, JSON and IPC in your SQL query using table functions `read_xxx`. -{{code_block('user-guide/sql/sql_select','tablefunctions',['SQLexecute'])}} +{{code_block('user-guide/sql/select','tablefunctions',['SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" ---8<-- "python/user-guide/sql/sql_select.py:tablefunctions" +--8<-- "python/user-guide/sql/select.py:tablefunctions" ``` diff --git a/mkdocs.yml b/mkdocs.yml index 425033e2eb19..501d047b35e5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -55,16 +55,16 @@ nav: - user-guide/lazy/using.md - user-guide/lazy/optimizations.md - user-guide/lazy/schemas.md - - user-guide/lazy/query_plan.md + - user-guide/lazy/query-plan.md - user-guide/lazy/execution.md - user-guide/lazy/streaming.md - IO: - user-guide/io/csv.md - user-guide/io/parquet.md - - user-guide/io/json_file.md + - user-guide/io/json.md - user-guide/io/multiple.md - user-guide/io/database.md - - user-guide/io/aws.md + - user-guide/io/cloud-storage.md - user-guide/io/bigquery.md - SQL: - user-guide/sql/intro.md
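
As a supplementary sketch for the `read_xxx` table functions mentioned in `docs/user-guide/sql/select.md` above, the following queries a CSV file directly from SQL; the file path is a placeholder and `eager=True` returns a `DataFrame` rather than a `LazyFrame`:

```python
import polars as pl

ctx = pl.SQLContext()

# Query a CSV file directly with the `read_csv` table function.
# The path is a placeholder; any local CSV works the same way.
df = ctx.execute(
    "SELECT * FROM read_csv('docs/data/iris.csv') LIMIT 5",
    eager=True,
)
print(df)
```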