From 4071c6b2de320d2e5a5bef1a32eba47c3e4bfc80 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 10 Jun 2024 08:23:51 -0400 Subject: [PATCH 1/2] keep strict manifests on root tile; typing improvements --- src/Deeptable.ts | 7 +- src/TixRixQid.ts | 183 +++++++++++++++++++++++++++++++++++++++++++++++ src/selection.ts | 54 +++++++------- src/shared.d.ts | 12 +++- src/tile.ts | 48 +++++++------ src/typing.ts | 2 +- tsconfig.json | 2 +- 7 files changed, 256 insertions(+), 52 deletions(-) create mode 100644 src/TixRixQid.ts diff --git a/src/Deeptable.ts b/src/Deeptable.ts index 57ba691a7..f3065c0bd 100644 --- a/src/Deeptable.ts +++ b/src/Deeptable.ts @@ -137,9 +137,10 @@ export class Deeptable { this.promise = preProcessRootTile.then(async () => { const batch = await this.root_tile.get_arrow(null); const schema = batch.schema; - this.root_tile.manifest = - await this.root_tile.deriveManifestInfoFromTileMetadata(); - + if (!tileManifest) { + this.root_tile.manifest = + await this.root_tile.deriveManifestInfoFromTileMetadata(); + } if (schema.metadata.has('sidecars')) { const cars = schema.metadata.get('sidecars'); if (typeof cars !== 'string') diff --git a/src/TixRixQid.ts b/src/TixRixQid.ts new file mode 100644 index 000000000..730b765b7 --- /dev/null +++ b/src/TixRixQid.ts @@ -0,0 +1,183 @@ +import type { Bool, StructRowProxy, Vector } from 'apache-arrow'; + +import type { Tile } from './deepscatter'; +import { Bitmask, DataSelection, Deeptable } from './deepscatter'; + +// The type below indicates that a Qid is not valid if +// there are zero rows selected in the tile. + +// A Tix is a tile index, which is an integer identifier for a tile in quadtree. +// It uses the formula (4^z - 1) / 3 + y * 2^z + x, where z is the zoom level, +// and x and y are the tile coordinates. + +type Tix = number; + +// An Rix is a row index, which is an integer identifier for a row in a tile. +type Rix = number; + +// A Rixen is a list of row indices. It must be non-empty. +type Rixen = [Rix, ...Rix[]]; + +// A Qid is a pair of a Tix and a Rixen. It identifies a set of rows in a tile. +export type Qid = [Tix, Rixen]; +export type QidArray = Qid[]; + +export function zxyToTix(z: number, x: number, y: number) { + return (4 ** z - 1) / 3 + y * 2 ** z + x; +} + +function parentTix(tix: number) { + const [z, x, y] = tixToZxy(tix); + return zxyToTix(z - 1, Math.floor(x / 2), Math.floor(y / 2)); +} + +/** + * + * @param tix The numeric tile index + * @param deeptable The deepscatter dataset + * @returns The tile, if it exists. + * + */ +export async function tixToTile(tix: Tix, deeptable: Deeptable) { + if (tix === 0) { + return deeptable.root_tile; + } + if (isNaN(tix)) { + throw new Error('NaN tile index'); + } + // We need all parents to exist to find their children. So + // we fetch the tiles here to ensure they've loaded. + const parent = await tixToTile(parentTix(tix), deeptable); + // + await parent.populateManifest(); + // Now that the parents are loaded, we can find the child. + const [z, x, y] = tixToZxy(tix); + const key = `${z}/${x}/${y}`; + const t = deeptable.map((d) => d).filter((d) => d.key === key); + if (t.length) { + return t[0]; + } + throw new Error(`Tile ${key} not found in dataset.`); +} + +/** + * + * @param qid a quadtree id + * @param dataset + * @returns + */ +export async function qidToRowProxy(qid: Qid, dataset: Deeptable) { + const tile = await tixToTile(qid[0], dataset); + await tile.get_column('x'); + return tile.record_batch.get(qid[1][0]); +} + +export function tileKey_to_tix(key: string) { + const [z, x, y] = key.split('/').map((d) => parseInt(d)); + return zxyToTix(z, x, y); +} + +export function tixToZxy(tix: Tix): [number, number, number] { + // This is the inverse function that goes from a quadtree tile's integer identifier 'qix' to the [z, x, y] tuple. + + // The z level is the inverse of the qix function. + // Javascript doesn't have base-4 logarithm I guess, so we divide the natural log by the natural log of 4. + const z = Math.floor(Math.log(tix * 3 + 1) / Math.log(4)); + + // We then get the index inside the tile, which is the offset from the base sequence. + const blockPosition = tix - (4 ** z - 1) / 3; + + // Modulo operations turn this into x and y coordinates. + const x = blockPosition % 2 ** z; + const y = Math.floor(blockPosition / 2 ** z); + return [z, x, y]; +} + +/** + * + * @param row the row returned from a point event, etc. + * @param dataset a deepscatter dataset. + * @returns + */ +export function getQidFromRow( + row: StructRowProxy, + dataset: Deeptable, +): [number, number] { + const tile = getTileFromRow(row, dataset); + const rix = row[Symbol.for('rowIndex')] as number; + return [tileKey_to_tix(tile.key), rix] as [number, number]; +} + +export function getTileFromRow(row: StructRowProxy, dataset: Deeptable): Tile { + const ix = row.ix as bigint; + const rix = row[Symbol.for('rowIndex')] as number; + const matches = dataset + .map((tile) => tile) + .filter((t) => { + try { + t.record_batch; + } catch (err) { + return false; + } + const relatedRow = t.record_batch.get(rix); + if (relatedRow === null) { + return false; + } + return relatedRow.ix === ix; + }); + + if (matches.length === 0) { + throw new Error('No tiles found for this row.'); + } + return matches[0]; +} + +export function getQidArrayFromRows( + rows: StructRowProxy[], + dataset: Deeptable, +): QidArray { + // TODO: this is really inefficient. We should be able to do this in one pass. + const qids = rows.map((row) => getQidFromRow(row, dataset)); + const mapped = new Map(); + for (const qid of qids) { + if (mapped.has(qid[0])) { + mapped.get(qid[0]).push(qid[1]); + } else { + mapped.set(qid[0], [qid[1]]); + } + } + return Array.from(mapped.entries()); +} + +export function selectQixOnTile(tile: Tile, qidList: QidArray) { + const mask = new Bitmask(tile.record_batch.numRows); + const [z, x, y] = tile.key.split('/').map((d) => parseInt(d)); + const tix = zxyToTix(z, x, y); + const rixes = qidList + .filter((d) => d[0] === tix) + .map((d) => d[1]) + .flat(); + for (const rix of rixes) { + mask.set(rix); + } + return mask.to_arrow(); +} + +/** + * + * @param hoverDatum A struct row. + * @param selection A DataSelection + * @param deeptable A Deepscatter dataset + * @returns + */ +export async function isDatumInSelection( + hoverDatum: StructRowProxy, + selection: DataSelection | null, + deeptable: Deeptable, +): Promise { + if (!selection) return false; + const [tix, rix] = getQidFromRow(hoverDatum, deeptable); + const owningTile = await tixToTile(tix, deeptable); + const array = (await owningTile.get_column(selection.name)) as Vector; + return !!array.get(rix); +} diff --git a/src/selection.ts b/src/selection.ts index e5795d5c6..af4c7b9a7 100644 --- a/src/selection.ts +++ b/src/selection.ts @@ -1,3 +1,4 @@ +/* eslint-disable no-constant-condition */ import { Deeptable } from './Deeptable'; import { Scatterplot } from './scatterplot'; import { Tile } from './tile'; @@ -7,6 +8,7 @@ import { DataType, StructRowProxy, Type, + Utf8, Vector, makeData, } from 'apache-arrow'; @@ -81,9 +83,9 @@ export interface CompositeSelectParams extends SelectParams { } function isCompositeSelectParam( - params: Record, + params: CompositeSelectParams | BooleanColumnParams | IdSelectParams, ): params is CompositeSelectParams { - return params.composition !== undefined; + return (params as CompositeSelectParams).composition !== undefined; } function isComposition(elems: unknown): elems is Composition { @@ -99,9 +101,7 @@ async function extractBitmask(tile: Tile, arg: CompArgs): Promise { if (isComposition(arg)) { return applyCompositeFunctionToTile(tile, arg); } else { - const column = tile.get_column((arg as DataSelection).name) as Promise< - Vector - >; + const column = tile.get_column(arg.name) as Promise>; return Bitmask.from_arrow(await column); } } @@ -130,7 +130,7 @@ async function applyCompositeFunctionToTile( } else if (isPluralSelectOperator(operator)) { const op = args[0]; const bitmasks = await Promise.all( - args.slice(1).map((arg) => extractBitmask(tile, arg)), + args.slice(1).map((arg: CompArgs) => extractBitmask(tile, arg)), ); const accumulated = bitmasks .slice(1) @@ -173,9 +173,13 @@ function isBinarySelectOperation( } function isFunctionSelectParam( - params: Record, + params: + | CompositeSelectParams + | BooleanColumnParams + | IdSelectParams + | FunctionSelectParams, ): params is FunctionSelectParams { - return params.tileFunction !== undefined; + return (params as FunctionSelectParams).tileFunction !== undefined; } /** @@ -364,7 +368,7 @@ export class DataSelection { } else if (isCompositeSelectParam(params)) { const { name, composition } = params; this.composition = composition; - this.add_function_column(name, async (tile: Tile) => { + void this.add_function_column(name, async (tile: Tile) => { const bitmask = await applyCompositeFunctionToTile(tile, composition); return bitmask.to_arrow(); }).then(markReady); @@ -377,14 +381,14 @@ export class DataSelection { * @param listener a function to call back. It takes * as an argument the `tile` that was just added. */ - on(event: string, listener: (args: any) => void): void { + on(event: string, listener: (args: unknown) => void): void { if (!this.events[event]) { this.events[event] = []; } this.events[event].push(listener); } - private dispatch(event: string, args: any): void { + private dispatch(event: string, args: unknown): void { if (this.events[event]) { this.events[event].forEach((listener) => listener(args)); } @@ -513,8 +517,8 @@ export class DataSelection { * * @param fields A list of fields in the data to export. */ - async export(fields: string[], format: 'json' = 'json') { - /* + // async export(fields: string[], format: 'json' = 'json') { + /* This would have benefits, but might fetch data we don't actually need. const preparation = [] @@ -525,14 +529,14 @@ export class DataSelection { } await Promise.all(preparation) */ - const columns = Object.fromEntries(fields.map((field) => [field, []])); - for (let row of this) { - for (let field of fields) { - columns[field].push(row[field]); - } - } - return columns; - } + // const columns = Object.fromEntries(fields.map((field) => [field, []])); + // for (let row of this) { + // for (let field of fields) { + // columns[field].push(row[field]); + // } + // } + // return columns; + // } public moveCursorToPoint( point: StructRowProxy<{ ix: DataType }>, @@ -857,7 +861,7 @@ function stringmatcher(field: string, matches: string[]) { if (!node[byte]) { node[byte] = []; } - node = node[byte] as TrieArray; + node = node[byte]; } // Mark the end of a Uint8Array with a special property @@ -878,8 +882,8 @@ function stringmatcher(field: string, matches: string[]) { * The Deepscatter transformation function. */ return async function (tile: Tile) { - const col = (await tile.get_column(field)).data[0]; - const bytes = col.values as Uint8Array; + const col = ((await tile.get_column(field)) as Vector).data[0]; + const bytes = col.values; const offsets = col.valueOffsets; // Initialize results as a Float32Array with the same @@ -893,7 +897,7 @@ function stringmatcher(field: string, matches: string[]) { let node = trie; for (let i = 0; i < len; i++) { const byte = bytes[start + i]; - node = node[byte] as TrieArray; + node = node[byte]; // If the node for this byte doesn't exist, the slice doesn't exist in the trie if (!node) { return false; diff --git a/src/shared.d.ts b/src/shared.d.ts index c754bdfa2..672a4a7e4 100644 --- a/src/shared.d.ts +++ b/src/shared.d.ts @@ -22,7 +22,6 @@ import { ZoomTransform } from 'd3-zoom'; import { TileBufferManager } from './regl_rendering'; import type { Tile } from './tile'; import type { Rectangle } from './tile'; -import { ScaleLinear } from 'd3-scale'; export type { Renderer, Deeptable, ConcreteAesthetic }; export type BufferLocation = { @@ -61,11 +60,20 @@ export type ScatterplotOptions = { // allow certain optimizations. export type TileStructure = 'quadtree' | 'other'; +export type LazyTileManifest = { + key: string; + // The number of data points in that specific tile. + nPoints: number; + children: string[]; + min_ix: number; + max_ix: number; + extent: Rectangle; +}; export type TileManifest = { key: string; // The number of data points in that specific tile. nPoints: number; - children: TileManifest[] | string[]; + children: TileManifest[]; min_ix: number; max_ix: number; extent: Rectangle; diff --git a/src/tile.ts b/src/tile.ts index 9972ec5ff..6a8983fea 100644 --- a/src/tile.ts +++ b/src/tile.ts @@ -22,7 +22,7 @@ export type Rectangle = { // } import type { TileBufferManager } from './regl_rendering'; -import type { ArrowBuildable, TileManifest } from './shared'; +import type { ArrowBuildable, LazyTileManifest, TileManifest } from './shared'; import { isCompleteManifest } from './typing'; export type RecordBatchCache = @@ -59,10 +59,10 @@ export class Tile { public _highest_known_ix?: number; public deeptable: Deeptable; public _transformations: Record> = {}; - public _deriveManifestFromTileMetadata?: Promise; + public _deriveManifestFromTileMetadata?: Promise; //private _promiseOfChildren? = Promise; - private _partialManifest: Partial; - private _completeManifest?: TileManifest; + private _partialManifest: Partial | Partial; + private _manifest?: TileManifest | LazyTileManifest; // A cache of fetchCalls for downloaded arrow tables, including any table schema metadata. // Tables may contain more than a single column, so this prevents multiple dispatch. @@ -82,12 +82,17 @@ export class Tile { * @param deeptable The full atlas deeptable of which this tile is a part. */ constructor( - key: string | (Partial & { key: string }), + key: + | string + | (Partial & { key: string }) + | Partial, parent: Tile | null, deeptable: Deeptable, ) { // If it's just initiated with a key, build that into a minimal manifest. - let manifest: Partial & { key: string }; + let manifest: + | (Partial & { key: string }) + | Partial; if (typeof key === 'string') { manifest = { key }; } else { @@ -106,12 +111,12 @@ export class Tile { if (deeptable === undefined) { throw new Error('No deeptable provided'); } + // Grab the next identifier off the queue. This should be async safe with the current setup, but // the logic might fall apart in truly parallel situations. this.numeric_id = tile_identifier++; if (isCompleteManifest(manifest)) this.manifest = manifest; - this._partialManifest = manifest; } @@ -256,14 +261,14 @@ export class Tile { } } - get manifest(): TileManifest { - if (!this._completeManifest) + get manifest(): TileManifest | LazyTileManifest { + if (!this._manifest) throw new Error('Attempted to access manifest on partially loaded tile.'); - return this._completeManifest; + return this._manifest; } - set manifest(manifest: TileManifest) { + set manifest(manifest: TileManifest | LazyTileManifest) { // Setting the manifest is the thing that spawns children. if (!manifest.children) { console.error({ manifest }); @@ -273,7 +278,7 @@ export class Tile { return new Tile(k, this, this.deeptable); }); this.highest_known_ix = manifest.max_ix; - this._completeManifest = manifest; + this._manifest = manifest; } set highest_known_ix(val) { @@ -314,7 +319,7 @@ export class Tile { } get min_ix() { - if (this._completeManifest && this.manifest?.min_ix !== undefined) { + if (this._manifest && this.manifest?.min_ix !== undefined) { return this.manifest.min_ix; } if (this.parent) { @@ -332,7 +337,7 @@ export class Tile { } get extent(): Rectangle { - if (this._completeManifest && this.manifest?.extent) { + if (this._manifest && this.manifest?.extent) { return this.manifest.extent; } return this.theoretical_extent; @@ -437,7 +442,7 @@ export class Tile { * @returns void */ async populateManifest(): Promise { - if (this._completeManifest) { + if (this._manifest) { return; } else if (this._partialManifest.children) { if (this._partialManifest.nPoints === undefined) { @@ -446,7 +451,7 @@ export class Tile { this.manifest = { ...this._partialManifest, key: this.key, - children: this._partialManifest.children, + children: this._partialManifest.children as string[], min_ix: this.min_ix, max_ix: this.max_ix, extent: this.extent, @@ -482,13 +487,15 @@ export class Tile { }); } - async deriveManifestInfoFromTileMetadata(): Promise { + async deriveManifestInfoFromTileMetadata(): Promise< + TileManifest | LazyTileManifest + > { // This should only be called once per tile. if (this._deriveManifestFromTileMetadata !== undefined) { return this._deriveManifestFromTileMetadata; } - const manifest: Partial = {}; + const manifest: Partial = {}; this._deriveManifestFromTileMetadata = this.get_arrow(null).then( async (batch) => { // For every column in the root tile, @@ -521,7 +528,8 @@ export class Tile { const children = metadata.get('children'); if (children) { - manifest.children = JSON.parse(children) as TileManifest[] | string[]; + const stringChildren = JSON.parse(children) as string[]; + manifest.children = stringChildren; } // TODO: make ix optionally parsed from metadata, not column. @@ -548,7 +556,7 @@ export class Tile { max_ix: manifest.max_ix, extent: manifest.extent, nPoints: batch.numRows, - } as TileManifest; + } as const; return fullManifest; }, ); diff --git a/src/typing.ts b/src/typing.ts index 3ed081804..823fc3b9f 100644 --- a/src/typing.ts +++ b/src/typing.ts @@ -36,7 +36,7 @@ export function isLabelset(labels: DS.Labelcall): labels is DS.Labelset { // There must be a general function here huh. export function isCompleteManifest( - manifest: Partial, + manifest: Partial | Partial, ): manifest is DS.TileManifest { for (const k of [ 'key', diff --git a/tsconfig.json b/tsconfig.json index a2293466a..d82263aad 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -20,7 +20,7 @@ "outDir": "./dist", "rootDir": "./src", "lib": ["DOM"], - "noEmitOnError": false, + "noEmitOnError": true, "emitDeclarationOnly": true, }, "$schema": "https://json.schemastore.org/tsconfig", From 2cc4bade9ad51a1c36fea5d43d5c605f6e55a8c6 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 10 Jun 2024 08:29:10 -0400 Subject: [PATCH 2/2] remove tixRixQid --- src/TixRixQid.ts | 183 ----------------------------------------------- 1 file changed, 183 deletions(-) delete mode 100644 src/TixRixQid.ts diff --git a/src/TixRixQid.ts b/src/TixRixQid.ts deleted file mode 100644 index 730b765b7..000000000 --- a/src/TixRixQid.ts +++ /dev/null @@ -1,183 +0,0 @@ -import type { Bool, StructRowProxy, Vector } from 'apache-arrow'; - -import type { Tile } from './deepscatter'; -import { Bitmask, DataSelection, Deeptable } from './deepscatter'; - -// The type below indicates that a Qid is not valid if -// there are zero rows selected in the tile. - -// A Tix is a tile index, which is an integer identifier for a tile in quadtree. -// It uses the formula (4^z - 1) / 3 + y * 2^z + x, where z is the zoom level, -// and x and y are the tile coordinates. - -type Tix = number; - -// An Rix is a row index, which is an integer identifier for a row in a tile. -type Rix = number; - -// A Rixen is a list of row indices. It must be non-empty. -type Rixen = [Rix, ...Rix[]]; - -// A Qid is a pair of a Tix and a Rixen. It identifies a set of rows in a tile. -export type Qid = [Tix, Rixen]; -export type QidArray = Qid[]; - -export function zxyToTix(z: number, x: number, y: number) { - return (4 ** z - 1) / 3 + y * 2 ** z + x; -} - -function parentTix(tix: number) { - const [z, x, y] = tixToZxy(tix); - return zxyToTix(z - 1, Math.floor(x / 2), Math.floor(y / 2)); -} - -/** - * - * @param tix The numeric tile index - * @param deeptable The deepscatter dataset - * @returns The tile, if it exists. - * - */ -export async function tixToTile(tix: Tix, deeptable: Deeptable) { - if (tix === 0) { - return deeptable.root_tile; - } - if (isNaN(tix)) { - throw new Error('NaN tile index'); - } - // We need all parents to exist to find their children. So - // we fetch the tiles here to ensure they've loaded. - const parent = await tixToTile(parentTix(tix), deeptable); - // - await parent.populateManifest(); - // Now that the parents are loaded, we can find the child. - const [z, x, y] = tixToZxy(tix); - const key = `${z}/${x}/${y}`; - const t = deeptable.map((d) => d).filter((d) => d.key === key); - if (t.length) { - return t[0]; - } - throw new Error(`Tile ${key} not found in dataset.`); -} - -/** - * - * @param qid a quadtree id - * @param dataset - * @returns - */ -export async function qidToRowProxy(qid: Qid, dataset: Deeptable) { - const tile = await tixToTile(qid[0], dataset); - await tile.get_column('x'); - return tile.record_batch.get(qid[1][0]); -} - -export function tileKey_to_tix(key: string) { - const [z, x, y] = key.split('/').map((d) => parseInt(d)); - return zxyToTix(z, x, y); -} - -export function tixToZxy(tix: Tix): [number, number, number] { - // This is the inverse function that goes from a quadtree tile's integer identifier 'qix' to the [z, x, y] tuple. - - // The z level is the inverse of the qix function. - // Javascript doesn't have base-4 logarithm I guess, so we divide the natural log by the natural log of 4. - const z = Math.floor(Math.log(tix * 3 + 1) / Math.log(4)); - - // We then get the index inside the tile, which is the offset from the base sequence. - const blockPosition = tix - (4 ** z - 1) / 3; - - // Modulo operations turn this into x and y coordinates. - const x = blockPosition % 2 ** z; - const y = Math.floor(blockPosition / 2 ** z); - return [z, x, y]; -} - -/** - * - * @param row the row returned from a point event, etc. - * @param dataset a deepscatter dataset. - * @returns - */ -export function getQidFromRow( - row: StructRowProxy, - dataset: Deeptable, -): [number, number] { - const tile = getTileFromRow(row, dataset); - const rix = row[Symbol.for('rowIndex')] as number; - return [tileKey_to_tix(tile.key), rix] as [number, number]; -} - -export function getTileFromRow(row: StructRowProxy, dataset: Deeptable): Tile { - const ix = row.ix as bigint; - const rix = row[Symbol.for('rowIndex')] as number; - const matches = dataset - .map((tile) => tile) - .filter((t) => { - try { - t.record_batch; - } catch (err) { - return false; - } - const relatedRow = t.record_batch.get(rix); - if (relatedRow === null) { - return false; - } - return relatedRow.ix === ix; - }); - - if (matches.length === 0) { - throw new Error('No tiles found for this row.'); - } - return matches[0]; -} - -export function getQidArrayFromRows( - rows: StructRowProxy[], - dataset: Deeptable, -): QidArray { - // TODO: this is really inefficient. We should be able to do this in one pass. - const qids = rows.map((row) => getQidFromRow(row, dataset)); - const mapped = new Map(); - for (const qid of qids) { - if (mapped.has(qid[0])) { - mapped.get(qid[0]).push(qid[1]); - } else { - mapped.set(qid[0], [qid[1]]); - } - } - return Array.from(mapped.entries()); -} - -export function selectQixOnTile(tile: Tile, qidList: QidArray) { - const mask = new Bitmask(tile.record_batch.numRows); - const [z, x, y] = tile.key.split('/').map((d) => parseInt(d)); - const tix = zxyToTix(z, x, y); - const rixes = qidList - .filter((d) => d[0] === tix) - .map((d) => d[1]) - .flat(); - for (const rix of rixes) { - mask.set(rix); - } - return mask.to_arrow(); -} - -/** - * - * @param hoverDatum A struct row. - * @param selection A DataSelection - * @param deeptable A Deepscatter dataset - * @returns - */ -export async function isDatumInSelection( - hoverDatum: StructRowProxy, - selection: DataSelection | null, - deeptable: Deeptable, -): Promise { - if (!selection) return false; - const [tix, rix] = getQidFromRow(hoverDatum, deeptable); - const owningTile = await tixToTile(tix, deeptable); - const array = (await owningTile.get_column(selection.name)) as Vector; - return !!array.get(rix); -}