Skip to content

Commit

Permalink
Merge pull request #2053 from mr-majkel/parquet_writer
Browse files Browse the repository at this point in the history
[parquet] add parquet writer [#2044]
  • Loading branch information
anjakefala authored Oct 18, 2023
2 parents 3e23624 + e0b63b9 commit e9c0d94
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 6 deletions.
Binary file added sample_data/sample.parquet
Binary file not shown.
44 changes: 44 additions & 0 deletions tests/golden/load-parquet.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
OrderDate Region Rep Item Units Unit_Cost Total
2016-01-06 East Jones Pencil 95 1.99 189.05
2016-01-23 Central Kivell Binder 50 19.99 999.50
2016-02-09 Central Jardine Pencil 36 4.99 179.64
2016-02-26 Central Gill Pen 27 19.99 539.73
2016-03-15 West Sorvino Pencil 56 2.99 167.44
2016-04-01 East Jones Binder 60 4.99 299.40
2016-04-18 Central Andrews Pencil 75 1.99 149.25
2016-05-05 Central Jardine Pencil 90 4.99 449.10
2016-05-22 West Thompson Pencil 32 1.99 63.68
2016-06-08 East Jones Binder 60 8.99 539.40
2016-06-25 Central Morgan Pencil 90 4.99 449.10
2016-07-12 East Howard Binder 29 1.99 57.71
2016-07-29 East Parent Binder 81 19.99 1619.19
2016-08-15 East Jones Pencil 35 4.99 174.65
2016-09-01 Central Smith Desk 2 125.00 250.00
2016-09-18 East Jones Pen Set 16 15.99 255.84
2016-10-05 Central Morgan Binder 28 8.99 251.72
2016-10-22 East Jones Pen 64 8.99 575.36
2016-11-08 East Parent Pen 15 19.99 299.85
2016-11-25 Central Kivell Pen Set 96 4.99 479.04
2016-12-12 Central Smith Pencil 67 1.29 86.43
2016-12-29 East Parent Pen Set 74 15.99 1183.26
2017-01-15 Central Gill Binder 46 8.99 413.54
2017-02-01 Central Smith Binder 87 15.00 1305.00
2017-02-18 East Jones Binder 4 4.99 19.96
2017-03-07 West Sorvino Binder 7 19.99 139.93
2017-03-24 Central Jardine Pen Set 50 4.99 249.50
2017-04-10 Central Andrews Pencil 66 1.99 131.34
2017-04-27 East Howard Pen 96 4.99 479.04
2017-05-14 Central Gill Pencil 53 1.29 68.37
2017-05-31 Central Gill Binder 80 8.99 719.20
2017-06-17 Central Kivell Desk 5 125.00 625.00
2017-07-04 East Jones Pen Set 62 4.99 309.38
2017-07-21 Central Morgan Pen Set 55 12.49 686.95
2017-08-07 Central Kivell Pen Set 42 23.95 1005.90
2017-08-24 West Sorvino Desk 3 275.00 825.00
2017-09-10 Central Gill Pencil 7 1.29 9.03
2017-09-27 West Sorvino Pen 76 1.99 151.24
2017-10-14 West Thompson Binder 57 19.99 1139.43
2017-10-31 Central Andrews Pencil 14 1.29 18.06
2017-11-17 Central Jardine Binder 11 4.99 54.89
2017-12-04 Central Jardine Binder 94 19.99 1879.06
2017-12-21 Central Andrews Binder 28 4.99 139.72
2 changes: 2 additions & 0 deletions tests/load-parquet.vd
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
sheet col row longname input keystrokes comment
open-file sample_data/sample.parquet o Open file or URL
55 changes: 49 additions & 6 deletions visidata/loaders/parquet.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,71 @@
from visidata import VisiData, Sheet, Column, vd
from visidata import Sheet, VisiData, TypedWrapper, anytype, date, vlen, Column, vd
from collections import defaultdict


@VisiData.api
def open_parquet(vd, p):
    """Build a ParquetSheet for path *p*, named after ``p.name``."""
    sheet = ParquetSheet(p.name, source=p)
    return sheet


class ParquetColumn(Column):
    """Column that reads its cells out of ``self.source`` by row number."""

    def calcValue(self, row):
        """Return the Python value stored at this row's ``__rownum__`` index.

        ``self.source`` is indexed with ``row["__rownum__"]`` and the resulting
        element is converted with its ``as_py()`` method.
        """
        position = row["__rownum__"]
        cell = self.source[position]
        return cell.as_py()


class ParquetSheet(Sheet):
    """Sheet backed by a table read from a Parquet file via pyarrow."""

    # rowdef: {'__rownum__':int, parquet_col:overridden_value, ...}
    def iterload(self):
        """Read the source file into ``self.tbl``, rebuild columns, yield rows.

        Each yielded row is a dict holding only its ``__rownum__`` index; the
        ParquetColumn objects use that index to fetch values from the table.
        """
        parquet = vd.importExternal("pyarrow.parquet", "pyarrow")
        from visidata.loaders.arrow import arrow_to_vdtype

        self.tbl = parquet.read_table(str(self.source))

        # Start from an empty column list, then add one column per table column.
        self.columns = []
        names = self.tbl.column_names
        arrow_cols = self.tbl.columns
        for name, arrow_col in zip(names, arrow_cols):
            vdtype = arrow_to_vdtype(arrow_col.type)
            self.addColumn(ParquetColumn(name, type=vdtype, source=arrow_col))

        total = self.tbl.num_rows
        for rownum in range(total):
            yield {"__rownum__": rownum}


@VisiData.api
def save_parquet(vd, p, sheet):
    """Write the visible columns of *sheet* to path *p* as a Parquet file.

    The whole sheet is written as a single record batch. Each column's
    VisiData type is mapped to a pyarrow type; types without a mapping are
    written as strings. Values that are TypedWrapper instances are written
    as nulls.

    Args:
        vd: the VisiData instance (used to import pyarrow lazily).
        p: destination path object; must provide ``open_bytes(mode="w")``.
        sheet: the sheet to save; only ``sheet.visibleCols`` are written.
    """
    pa = vd.importExternal("pyarrow")
    pq = vd.importExternal("pyarrow.parquet", "pyarrow")

    typemap = {
        anytype: pa.string(),
        int: pa.int64(),
        vlen: pa.int64(),
        float: pa.float64(),
        str: pa.string(),
        date: pa.date64(),
    }

    # Any remaining numeric type without an explicit mapping is stored as float64.
    for t in vd.numericTypes:
        typemap.setdefault(t, pa.float64())

    cols = list(sheet.visibleCols)

    def arrow_type(col):
        # Unmapped column types fall back to string.
        return typemap.get(col.type, pa.string())

    # Seed one list per visible column so that a sheet with no rows still
    # produces one (empty) array per schema field instead of no arrays at all.
    databycol = defaultdict(list)  # col -> [values]
    for col in cols:
        databycol[col]

    # NOTE(review): assumes iterdispvals() with no explicit columns yields
    # values keyed by the sheet's visible columns -- confirm.
    for typedvals in sheet.iterdispvals(format=False):
        for col, val in typedvals.items():
            if isinstance(val, TypedWrapper):
                val = None

            databycol[col].append(val)

    # Build arrays in visibleCols order so they line up with the schema fields
    # by construction rather than by dict insertion order.
    data = [pa.array(databycol[col], type=arrow_type(col)) for col in cols]
    schema = pa.schema([(col.name, arrow_type(col)) for col in cols])

    with p.open_bytes(mode="w") as outf:
        with pq.ParquetWriter(outf, schema) as writer:
            writer.write_batch(pa.record_batch(data, schema=schema))

0 comments on commit e9c0d94

Please sign in to comment.