-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
prototype dak.from_text
#7
Changes from 34 commits
7b00694
df3db18
d2b7a67
30e7748
8826cf8
5aa5d01
4e83fd2
4e5e883
bc686d2
a235067
31802a9
8471d93
c030a9b
4ed1aff
b197665
7a97c02
a49c1cd
ad97c61
c8bc1ec
d1dcc72
84a0dee
d7957be
f2cb69c
bd65e00
5a07892
8764ec6
1345d27
ad3cb35
8ec7acb
74c0d79
66cf018
cd6168f
f9bbec1
4dc3832
e3bff17
85316c3
94de75d
50d4aaa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
from_delayed, | ||
from_lists, | ||
from_map, | ||
from_text, | ||
to_dask_array, | ||
to_dask_bag, | ||
to_dataframe, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,8 @@ | |
from awkward.types.numpytype import primitive_to_dtype | ||
from dask.base import flatten, tokenize | ||
from dask.highlevelgraph import HighLevelGraph | ||
from dask.utils import funcname | ||
from dask.utils import funcname, is_integer, parse_bytes | ||
from fsspec.utils import read_block | ||
|
||
from dask_awkward.layers import AwkwardBlockwiseLayer, AwkwardInputLayer | ||
from dask_awkward.layers.layers import AwkwardMaterializedLayer | ||
|
@@ -25,6 +26,7 @@ | |
from dask.bag.core import Bag as DaskBag | ||
from dask.dataframe.core import DataFrame as DaskDataFrame | ||
from dask.delayed import Delayed | ||
from fsspec.spec import AbstractFileSystem | ||
|
||
from dask_awkward.lib.core import Array | ||
|
||
|
@@ -572,3 +574,150 @@ def from_map( | |
) | ||
|
||
return result | ||
|
||
|
||
def bytes_reading_ingredients(
    fs: AbstractFileSystem,
    paths: list[str],
    compression: str | None,
    delimiter: bytes | None,
    not_zero: bool,
    blocksize: str | int | None,
    sample: str | int,
) -> tuple[list[list[tuple]], bytes]:
    """Plan byte-range reads over *paths* and grab a leading sample.

    Parameters
    ----------
    fs:
        Filesystem used for ``info`` and ``open`` calls.
    paths:
        Files to be read; must be non-empty (the sample is taken from
        ``paths[0]``).
    compression:
        Compression codec name; chunked reads require ``None``.
    delimiter:
        Record-terminating byte string used to align the sample on a
        record boundary; ``None`` disables alignment.
    not_zero:
        If True, start the first block of each file at offset 1 instead
        of 0 (skip a known leading byte).
    blocksize:
        Target chunk size — a byte-count string (e.g. ``"128 MiB"``),
        an integer, or ``None`` for one chunk per file.
    sample:
        Number of bytes (or byte-count string) to read from the head of
        the first file.

    Returns
    -------
    tuple
        ``(read_specs, sample_bytes)`` where ``read_specs`` holds one
        list per path of ``(fs, path, compression, offset, length,
        delimiter)`` tuples.

    Raises
    ------
    TypeError
        If ``blocksize`` does not parse to an integer.
    ValueError
        If chunked reads are requested on compressed files, or the
        filesystem cannot report a file size.
    """
    if blocksize is not None:
        if isinstance(blocksize, str):
            blocksize = parse_bytes(blocksize)
        if not is_integer(blocksize):
            raise TypeError("blocksize must be an integer")
        blocksize = int(blocksize)

    if blocksize is None:
        # One chunk per file: read everything starting at offset 0.
        offsets = [[0]] * len(paths)
        lengths = [[None]] * len(paths)
    else:
        # Loop-invariant check hoisted out of the per-path loop: chunked
        # reads are incompatible with compressed input, so fail fast.
        if compression is not None:
            raise ValueError(
                "Cannot do chunked reads on compressed files. "
                "To read, set blocksize=None"
            )
        offsets = []
        lengths = []
        for path in paths:
            size = fs.info(path)["size"]
            if size is None:
                raise ValueError(
                    "Backing filesystem couldn't determine file size, cannot "
                    "do chunked reads. To read, set blocksize=None."
                )
            elif size == 0:
                # skip empty
                offsets.append([])
                lengths.append([])
            else:
                # shrink blocksize to give same number of parts
                if size % blocksize and size > blocksize:
                    blocksize1 = size / (size // blocksize)
                else:
                    blocksize1 = blocksize
                place = 0
                off = [0]
                length = []

                # figure out offsets, spreading around spare bytes
                while size - place > (blocksize1 * 2) - 1:
                    place += blocksize1
                    off.append(int(place))
                    length.append(off[-1] - off[-2])
                length.append(size - off[-1])

                if not_zero:
                    off[0] = 1
                    length[0] -= 1
                offsets.append(off)
                lengths.append(length)

    out = []
    for path, offset, length in zip(paths, offsets, lengths):
        values = [
            (fs, path, compression, offs, leng, delimiter)
            for offs, leng in zip(offset, length)
        ]
        out.append(values)

    sample_size = parse_bytes(sample) if isinstance(sample, str) else sample
    with fs.open(paths[0], compression=compression) as f:
        # read block without seek (because we start at zero)
        if delimiter is None:
            sample_bytes = f.read(sample_size)
        else:
            sample_buff = f.read(sample_size)
            while True:
                new = f.read(sample_size)
                if not new:
                    break
                if delimiter in new:
                    sample_buff = sample_buff + new.split(delimiter, 1)[0] + delimiter
                    break
                sample_buff = sample_buff + new
            sample_bytes = sample_buff

            # Trim the trailing partial record so the sample ends on a
            # delimiter boundary; guarded inside this branch because
            # bytes.rfind(None) would raise TypeError.
            rfind = sample_bytes.rfind(delimiter)
            if rfind > 0:
                sample_bytes = sample_bytes[:rfind]

    return out, sample_bytes
|
||
|
||
class FromTextFn:
    """Per-partition callable handed to ``from_map``: turn one byte-range
    read spec into an awkward array of text lines."""

    def __init__(self):
        pass

    def __call__(self, ingredients: tuple) -> ak.Array:
        """Read the described byte range and split it into lines.

        ``ingredients`` is a ``(fs, path, compression, offset, length,
        delimiter)`` tuple as produced by ``bytes_reading_ingredients``.
        """
        fs, path, compression, offset, length, delim = ingredients

        with fs.open(path, compression=compression) as fh:
            reads_whole_file = offset == 0 and length is None
            if reads_whole_file:
                raw = fh.read()
            else:
                raw = read_block(fh, offset, length, delim)

        # Wrap the raw bytes as a single awkward string value ...
        as_uint8 = np.frombuffer(raw, dtype=np.uint8)
        chars = ak.from_numpy(as_uint8)
        one_string = ak.enforce_type(ak.unflatten(chars, len(chars)), "string")

        # ... then split on newlines; the trailing [0] unwraps the single
        # outer element introduced by ``unflatten`` above.
        # NOTE(review): the split pattern is hard-coded to "\n" and does
        # not use ``delim`` — confirm this is intentional.
        return ak.str.split_pattern(one_string, "\n")[0]
|
||
|
||
def from_text(source, blocksize, delimiter, storage_options: dict | None = None):
    """Create a lazy dask-awkward collection of text lines from file(s).

    Parameters
    ----------
    source:
        Path, glob pattern, or list of paths understood by fsspec.
    blocksize:
        Chunk size for partitioned reads (byte-count string or int), or
        ``None`` for one partition per file.
    delimiter:
        Byte string terminating a record; partition boundaries are
        aligned to it.
    storage_options:
        Extra keyword arguments for the backing filesystem.

    Returns
    -------
    Array
        Collection with one partition per planned byte range.
    """
    from fsspec.core import get_fs_token_paths

    fs, token, paths = get_fs_token_paths(source, storage_options=storage_options or {})

    # The sample bytes are currently unused (the output form is already
    # known to be array-of-string), so they are discarded here; the
    # "128 KiB" sample size is a placeholder pending a user option.
    bytes_ingredients, _ = bytes_reading_ingredients(
        fs,
        paths,
        None,  # compression: chunked reads require uncompressed input
        delimiter,
        False,  # not_zero: keep offset 0 of each file
        blocksize,
        "128 KiB",  # sample size
    )

    return from_map(
        FromTextFn(),
        list(flatten(bytes_ingredients)),
        label="from-text",
        token=token,
        meta=None,
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the upstream dask function which will find the next delimiter after the start and stop of the block, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yea these lines of code are from upstream dask (https://github.com/dask/dask/blob/4178feb58a7e708345ce4e41e018a746b0d1fd06/dask/bytes/core.py#L190)