From 3a284a5ea97ebe0ef500c9911eaeddebe88ad741 Mon Sep 17 00:00:00 2001
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com>
Date: Mon, 7 Oct 2024 17:05:34 +0100
Subject: [PATCH] feat: Adds `.arrow` support

To support [flights-200k.arrow](https://github.com/vega/vega-datasets/blob/f637f85f6a16f4b551b9e2eb669599cc21d77e69/data/flights-200k.arrow)
---
 tools/vendor_datasets.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/vendor_datasets.py b/tools/vendor_datasets.py
index 08c3094e7..26e1207c4 100644
--- a/tools/vendor_datasets.py
+++ b/tools/vendor_datasets.py
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import sys
+import tempfile
 from functools import cached_property, partial
 from pathlib import Path
 from typing import Any, Callable, ClassVar, Literal
@@ -29,11 +30,11 @@
 _OLD_SOURCE_TAG = "v1.29.0"  # 5 years ago
 _CURRENT_SOURCE_TAG = "v2.9.0"
 
-ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv"]
+ExtSupported: TypeAlias = Literal[".csv", ".json", ".tsv", ".arrow"]
 
 
 def is_ext_supported(suffix: str) -> TypeIs[ExtSupported]:
-    return suffix in {".csv", ".json", ".tsv"}
+    return suffix in {".csv", ".json", ".tsv", ".arrow"}
 
 
 def _py_to_js(s: str, /):
@@ -49,6 +50,7 @@ class Dataset:
         ".csv": pl.read_csv,
         ".json": pl.read_json,
         ".tsv": partial(pl.read_csv, separator="\t"),
+        ".arrow": partial(pl.read_ipc, use_pyarrow=True),
     }
 
     def __init__(self, name: str, /, base_url: str) -> None:
@@ -63,9 +65,10 @@ def __init__(self, name: str, /, base_url: str) -> None:
         self.url: str = f"{base_url}{file_name}"
 
     def __call__(self, **kwds: Any) -> pl.DataFrame:
-        with urlopen(self.url) as f:
-            fn = self.read_fn[self.extension]
-            content = fn(f, **kwds)
+        fn = self.read_fn[self.extension]
+        with tempfile.NamedTemporaryFile() as tmp, urlopen(self.url) as f:
+            tmp.write(f.read())
+            content = fn(tmp, **kwds)
         return content
 
     def __repr__(self) -> str: