docs: add documentation for from_text #360

Merged · 3 commits · Sep 7, 2023
2 changes: 1 addition & 1 deletion docs/api/io.rst
@@ -13,14 +13,14 @@ IO
from_lists
from_map
from_parquet
from_text
to_dask_array
to_dask_bag
to_dataframe
to_delayed
to_parquet
to_json


.. raw:: html

<script data-goatcounter="https://dask-awkward.goatcounter.com/count"
1 change: 1 addition & 0 deletions docs/conf.py
@@ -44,6 +44,7 @@
"sphinx.ext.todo",
"sphinx.ext.autosectionlabel",
"sphinx_design",
"sphinx_codeautolink",
]

# Add any paths that contain templates here, relative to this directory.
18 changes: 8 additions & 10 deletions docs/how-to/io.rst
@@ -3,23 +3,21 @@ Data IO

Creating dask-awkward collections typically begins with reading from
either local disk or cloud storage. There is built-in support for
datasets stored in Parquet or JSON format.
datasets stored in Parquet or JSON format, along with support for
reading text files where each line is treated as an element of the
array.

Take this code block for example:

.. code:: pycon

>>> import dask_awkward as dak
>>> ds1 = dak.from_parquet("s3://path/to/dataset")
>>> ds2 = dak.from_json("/path/to/json-files/*.json")

Both the :py:func:`~dask_awkward.from_parquet` and
:func:`~dask_awkward.from_json` calls will create new
:class:`dask_awkward.Array` instances. In the Parquet example we will
read data from Amazon S3; in the JSON example we're reading data from
local disk (notice the wildcard syntax: all JSON files in that
directory will be discovered, and each file will become a partition in
the collection).
>>> ds2 = dak.from_json("/path/to/json-files")
>>> ds3 = dak.from_text("s3://some/text/*.txt")

In the Parquet and text examples we read data from Amazon S3; in the
JSON example we read from local disk. These collections will be
partitioned on a per-file basis.

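For example (a sketch using the hypothetical paths above; the exact
count depends on how many files are discovered):

.. code:: pycon

   >>> import dask_awkward as dak
   >>> ds = dak.from_json("/path/to/json-files")
   >>> ds.npartitions  # one partition per discovered file
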
Support for the ROOT file format is provided by the Uproot_ project.

4 changes: 4 additions & 0 deletions docs/more/optimization.rst
@@ -59,6 +59,7 @@ If our task graph is of the form:

.. code:: pycon

>>> import dask_awkward as dak
>>> ds = dak.from_parquet("/path/to/data")
>>> result = ds.bar.x / ds.foo

@@ -103,6 +104,7 @@ necessary columns):

.. code:: pycon

>>> import dask_awkward as dak
>>> dak.necessary_columns(result)
{"some-layer-name": ["foo", "bar.x"]}

@@ -131,6 +133,8 @@ above example, we write

.. code:: pycon

>>> import dask_awkward as dak
>>> import dask.config
>>> ds = dak.from_parquet("/path/to/data", columns=["bar.x", "foo"])
>>> result = ds.bar.x / ds.foo
>>> with dask.config.set({"awkward.optimization.enabled": False}):
1 change: 1 addition & 0 deletions pyproject.toml
@@ -49,6 +49,7 @@ docs = [
"dask-awkward[complete]",
"dask-sphinx-theme >=3.0.2",
"sphinx-design",
"sphinx-codeautolink",
"requests >=2.27.1",
]
test = [
2 changes: 1 addition & 1 deletion src/dask_awkward/lib/core.py
@@ -2307,7 +2307,7 @@ def partition_compatibility(*args: Array) -> PartitionCompatibility:
>>> import dask_awkward as dak
>>> import awkward as ak
>>> concrete = ak.Array([[1, 2, 3], [4], [5, 6], [0, 0, 0, 0]])
>>> lazy = dak.from_awkward(concrete npartitions=2)
>>> lazy = dak.from_awkward(concrete, npartitions=2)
>>> selection = dak.sum(lazy, axis=1) == 0
>>> dak.partition_compatibility(lazy, selection)
<PartitionCompatibility.YES: 0>
34 changes: 32 additions & 2 deletions src/dask_awkward/lib/io/text.py
@@ -57,13 +57,43 @@ def from_text(
source: str | list[str],
blocksize: str | int = "128 MiB",
delimiter: bytes = b"\n",
sample_size: str | int = "128 KiB",
compression: str | None = "infer",
storage_options: dict | None = None,
) -> Array:
"""Create an Array collection from text data and a delimiter.

The default behavior of this input function is to create an array
collection where elements are separated by newlines.

Parameters
----------
source : str | list[str]
Data source as a list of files or a single path (can be remote
files).
blocksize : str | int
Size of each partition, in bytes or as a human-readable string
(e.g. ``"128 MiB"``).
delimiter : bytes
Delimiter to separate elements of the array (default is
newline character).
compression : str, optional
Compression of the files for reading (default is to infer).
storage_options : dict, optional
Storage options passed to the ``fsspec`` filesystem.

Returns
-------
Array
Resulting collection.

Examples
--------
>>> import dask_awkward as dak
>>> ds = dak.from_text("s3://path/to/files/*.txt", blocksize="256 MiB")
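>>> # Hypothetical sketch: compression can be passed explicitly (or
>>> # inferred from the file extension by default).
>>> ds2 = dak.from_text("/path/to/files/*.txt.gz", compression="gzip")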

"""
fs, token, paths = get_fs_token_paths(source, storage_options=storage_options or {})

token = tokenize(source, token, blocksize, delimiter, sample_size)
token = tokenize(source, token, blocksize, delimiter, compression)

if compression == "infer":
compression = infer_compression(paths[0])