From 56bfad94f9203c64e21403703aee074554ec32cc Mon Sep 17 00:00:00 2001 From: Dimitar Bounov Date: Thu, 18 Jan 2024 10:53:47 +0200 Subject: [PATCH 1/2] Rename unicode helpers to make them less confusing --- ...structured_documentation_reconstruction.ts | 8 +-- src/ast/writing/writer.ts | 4 +- src/bin/compile.ts | 4 +- src/compile/compiler_selection.ts | 4 +- src/compile/inference/imports.ts | 4 +- src/compile/input.ts | 4 +- src/compile/utils.ts | 8 +-- src/misc/unicode.ts | 66 ++++++++++++++++--- 8 files changed, 76 insertions(+), 26 deletions(-) diff --git a/src/ast/postprocessing/structured_documentation_reconstruction.ts b/src/ast/postprocessing/structured_documentation_reconstruction.ts index 06c17f65..93509f07 100644 --- a/src/ast/postprocessing/structured_documentation_reconstruction.ts +++ b/src/ast/postprocessing/structured_documentation_reconstruction.ts @@ -1,4 +1,4 @@ -import { strByteLen, toUTF8 } from "../../misc"; +import { bytesToString, strUTF8Len } from "../../misc"; import { ASTNode } from "../ast_node"; import { ASTContext, ASTNodePostprocessor, FileMap } from "../ast_reader"; import { RawComment, parseComments } from "../comments"; @@ -31,7 +31,7 @@ export class StructuredDocumentationReconstructor { source: Uint8Array ): StructuredDocumentation | undefined { const [from, to, sourceIndex] = coords; - const fragment = toUTF8(source.slice(from, to)); + const fragment = bytesToString(source.slice(from, to)); const parsedCommentsSoup = parseComments(fragment); @@ -66,9 +66,9 @@ export class StructuredDocumentationReconstructor { return undefined; } - const byteOffsetFromFragment = strByteLen(fragment.slice(0, lastComment.loc.start)); + const byteOffsetFromFragment = strUTF8Len(fragment.slice(0, lastComment.loc.start)); const offset = from + byteOffsetFromFragment; - const length = strByteLen(lastComment.text); + const length = strUTF8Len(lastComment.text); const src = `${offset}:${length}:${sourceIndex}`; return new StructuredDocumentation(0, src, 
lastComment.internalText.trim()); diff --git a/src/ast/writing/writer.ts b/src/ast/writing/writer.ts index 0e4a92d9..94c31854 100644 --- a/src/ast/writing/writer.ts +++ b/src/ast/writing/writer.ts @@ -1,4 +1,4 @@ -import { strByteLen } from "../../misc"; +import { strUTF8Len } from "../../misc"; import { ASTNode, ASTNodeConstructor } from "../ast_node"; import { YulNode } from "../implementation/statement/inline_assembly"; import { SourceFormatter } from "./formatter"; @@ -117,7 +117,7 @@ export class ASTWriter { for (const element of current) { if (typeof element === "string") { source += element; - size += strByteLen(element); + size += strUTF8Len(element); } else { const [node, nodeDesc] = element; const start = size; diff --git a/src/bin/compile.ts b/src/bin/compile.ts index 7c018f55..ea9c6d20 100644 --- a/src/bin/compile.ts +++ b/src/bin/compile.ts @@ -7,6 +7,7 @@ import { ASTNodeFormatter, ASTReader, ASTWriter, + bytesToString, CACHE_DIR, CompilationOutput, CompileFailedError, @@ -32,7 +33,6 @@ import { PrettyFormatter, SourceUnit, StateVariableVisibility, - toUTF8, VariableDeclaration, XPath } from ".."; @@ -326,7 +326,7 @@ function error(message: string): never { data.sources[key] = {}; } - data.sources[key].source = toUTF8(value); + data.sources[key].source = bytesToString(value); } } diff --git a/src/compile/compiler_selection.ts b/src/compile/compiler_selection.ts index 1c8d0884..bce9df58 100644 --- a/src/compile/compiler_selection.ts +++ b/src/compile/compiler_selection.ts @@ -1,4 +1,4 @@ -import { toUTF8 } from "../misc"; +import { bytesToString } from "../misc"; import { CompilerSeries, CompilerVersions } from "./constants"; import { extractSpecifiersFromSource, getCompilerVersionsBySpecifiers } from "./version"; @@ -80,7 +80,7 @@ export class VersionDetectionStrategy implements CompilerVersionSelectionStrateg fallback: CompilerVersionSelectionStrategy, descending = true ) { - this.sources = sources.map(toUTF8); + this.sources = 
sources.map(bytesToString); this.fallback = fallback; this.descending = descending; } diff --git a/src/compile/inference/imports.ts b/src/compile/inference/imports.ts index 6919dc1c..1fa71f45 100644 --- a/src/compile/inference/imports.ts +++ b/src/compile/inference/imports.ts @@ -1,7 +1,7 @@ import fse from "fs-extra"; import { dirname, normalize } from "path"; import { CompileInferenceError, ImportResolver, Remapping } from ".."; -import { FileMap, assert, toUTF8 } from "../.."; +import { FileMap, assert, bytesToString } from "../.."; import { AnyFileLevelNode, FileLevelNodeKind, @@ -161,7 +161,7 @@ export async function findAllFiles( let flds: AnyFileLevelNode[]; try { - flds = parseFileLevelDefinitions(toUTF8(content)); + flds = parseFileLevelDefinitions(bytesToString(content)); } catch (e: any) { if (e instanceof PeggySyntaxError) { const start = e.location.start.offset; diff --git a/src/compile/input.ts b/src/compile/input.ts index 743d13c5..3608b2ff 100644 --- a/src/compile/input.ts +++ b/src/compile/input.ts @@ -1,5 +1,5 @@ import { FileMap } from "../ast"; -import { toUTF8 } from "../misc"; +import { bytesToString } from "../misc"; import { CompilationOutput } from "./constants"; export interface PartialSolcInput { @@ -81,7 +81,7 @@ export function createCompilerInput( partialInp.sources = {}; for (const [fileName, content] of files.entries()) { - partialInp.sources[fileName] = { content: toUTF8(content) }; + partialInp.sources[fileName] = { content: bytesToString(content) }; } const input = partialInp as SolcInput; diff --git a/src/compile/utils.ts b/src/compile/utils.ts index 1ef02eee..ca2cb1c4 100644 --- a/src/compile/utils.ts +++ b/src/compile/utils.ts @@ -1,7 +1,7 @@ import fse from "fs-extra"; import path from "path"; import { FileSystemResolver, getCompilerForVersion, LocalNpmResolver } from "."; -import { assert, fromUTF8 } from "../misc"; +import { assert, stringToBytes } from "../misc"; import { CompilerVersionSelectionStrategy, 
LatestVersionInEachSeriesStrategy, @@ -114,7 +114,7 @@ export function parsePathRemapping(remapping: string[]): Remapping[] { function fillFilesFromSources(files: FileMap, sources: { [fileName: string]: any }): void { for (const [fileName, section] of Object.entries(sources)) { if (section && typeof section.source === "string") { - files.set(fileName, fromUTF8(section.source)); + files.set(fileName, stringToBytes(section.source)); } } } @@ -209,7 +209,7 @@ export async function compileSourceString( const resolvers = [fsResolver, npmResolver]; const parsedRemapping = parsePathRemapping(remapping); - const files = new Map([[fileName, fromUTF8(sourceCode)]]); + const files = new Map([[fileName, stringToBytes(sourceCode)]]); const resolvedFileNames = new Map([[fileName, fileName]]); await findAllFiles(files, resolvedFileNames, parsedRemapping, resolvers); @@ -386,7 +386,7 @@ export async function compileJsonData( if (consistentlyContainsOneOf(sources, "source")) { for (const [fileName, fileData] of Object.entries<{ source: string }>(sources)) { - files.set(fileName, fromUTF8(fileData.source)); + files.set(fileName, stringToBytes(fileData.source)); } const compilerVersionStrategy = getCompilerVersionStrategy([...files.values()], version); diff --git a/src/misc/unicode.ts b/src/misc/unicode.ts index 5c21133b..82df82d7 100644 --- a/src/misc/unicode.ts +++ b/src/misc/unicode.ts @@ -1,14 +1,64 @@ -const decoder = new TextDecoder(); -const encoder = new TextEncoder(); +const utf8Enc = new TextEncoder(); +const utf8Dec = new TextDecoder(); +const scratch = new Uint8Array(4); -export function toUTF8(buf: Uint8Array): string { - return decoder.decode(buf); +/** + * Convert UTF-8 encoded bytes into a JS UTF-16 string + */ +export function bytesToString(buf: Uint8Array): string { + return utf8Dec.decode(buf); } -export function fromUTF8(str: string): Uint8Array { - return encoder.encode(str); +/** + * Convert a JS UTF-16 string into UTF-8 encoded bytes + */ +export function 
stringToBytes(str: string): Uint8Array { + return utf8Enc.encode(str); } -export function strByteLen(str: string): number { - return fromUTF8(str).length; +/** + * Compute the length of a JS string when encoded as UTF-8 bytes + */ +export function strUTF8Len(s: string): number { + let len = 0; + for (const ch of s) { + len += utf8Enc.encodeInto(ch, scratch).written; + } + + return len; +} + +/** + * Given a JS string `s` and a UTF-16 code unit index `idx` that falls on a + * character boundary, compute the corresponding byte offset in the UTF-8 + * encoding of `s`. Throws if `idx` is out of range or inside a surrogate pair. + */ +export function strUTF16IndexToUTF8Offset(s: string, idx: number): number { + let i = 0, + off = 0; + + for (const ch of s) { + if (i === idx) { + return off; + } + + const charBytes = utf8Enc.encodeInto(ch, scratch).written; + + i += ch.length; // UTF-16 code units: 1 for BMP chars (1-3 UTF-8 bytes), 2 for surrogate pairs (4 bytes) + off += charBytes; + + if (i === idx) { + return off; + } + + if (i >= idx) { + throw new Error(`No unicode character index ${idx} in string ${s}.`); + } + } + + if (i === idx) { + return off; + } + + throw new Error(`No unicode character index ${idx} in string ${s}.`); } From ea335e27f46cf67758bc2cbdb67669ff43123916 Mon Sep 17 00:00:00 2001 From: Dimitar Bounov Date: Thu, 18 Jan 2024 11:03:03 +0200 Subject: [PATCH 2/2] fix tests --- test/unit/ast/ast_node.spec.ts | 6 +++--- test/unit/ast/unicode.spec.ts | 8 ++++---- test/unit/compile/compiler_selection.spec.ts | 4 ++-- test/unit/compile/inference/findAllFiles.spec.ts | 6 +++--- test/unit/compile/utils.spec.ts | 9 ++++++--- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/test/unit/ast/ast_node.spec.ts b/test/unit/ast/ast_node.spec.ts index fd8753cc..589de54b 100644 --- a/test/unit/ast/ast_node.spec.ts +++ b/test/unit/ast/ast_node.spec.ts @@ -5,10 +5,10 @@ import { ASTReader, Block, compileJson, - fromUTF8, FunctionDefinition, Literal, - SourceUnit + SourceUnit, + stringToBytes } from "../../../src"; describe("ASTNode", () => { @@ -102,7 +102,7 @@ describe("ASTNode", () => { 
it("extractSourceFragment()", () => { const increment = nodes[nodes.length - 2]; - expect(increment.extractSourceFragment(source)).toEqual(fromUTF8("a++")); + expect(increment.extractSourceFragment(source)).toEqual(stringToBytes("a++")); }); }); } diff --git a/test/unit/ast/unicode.spec.ts b/test/unit/ast/unicode.spec.ts index d9ed190b..83c7e473 100644 --- a/test/unit/ast/unicode.spec.ts +++ b/test/unit/ast/unicode.spec.ts @@ -13,11 +13,11 @@ import { SrcRangeMap, StructuredDocumentation, assert, + bytesToString, compileSol, compileSourceString, detectCompileErrors, - fromUTF8, - toUTF8 + stringToBytes } from "../../../src"; const samples: string[] = [ @@ -30,7 +30,7 @@ async function strToAst( contents: string, version: string ): Promise<[SourceUnit, ASTReader]> { - const sources: FileMap = new Map([[name, fromUTF8(contents)]]); + const sources: FileMap = new Map([[name, stringToBytes(contents)]]); const canonicalResult = await compileSourceString(name, contents, version); const errors = detectCompileErrors(canonicalResult.data); @@ -85,7 +85,7 @@ describe("Unicode tests", () => { for (const doc of docs) { const coords = doc.sourceInfo; - const actual = toUTF8( + const actual = bytesToString( contents.slice(coords.offset, coords.offset + coords.length) ).trim(); diff --git a/test/unit/compile/compiler_selection.spec.ts b/test/unit/compile/compiler_selection.spec.ts index 45308e57..3185649f 100644 --- a/test/unit/compile/compiler_selection.spec.ts +++ b/test/unit/compile/compiler_selection.spec.ts @@ -7,11 +7,11 @@ import { CompilerVersions07, CompilerVersions08, CompilerVersionSelectionStrategy, - fromUTF8, LatestAndFirstVersionInEachSeriesStrategy, LatestCompilerVersion, LatestVersionInEachSeriesStrategy, RangeVersionStrategy, + stringToBytes, VersionDetectionStrategy } from "../../../src"; @@ -180,7 +180,7 @@ describe("VersionDetectionStrategy", () => { it(`Returns ${JSON.stringify(range)} for ${JSON.stringify(source)} and ${ fallback.constructor.name } in 
constructor`, () => { - const strategy = new VersionDetectionStrategy([fromUTF8(source)], fallback); + const strategy = new VersionDetectionStrategy([stringToBytes(source)], fallback); expect(strategy.select()).toEqual(range); }); diff --git a/test/unit/compile/inference/findAllFiles.spec.ts b/test/unit/compile/inference/findAllFiles.spec.ts index b7b34803..5b8130af 100644 --- a/test/unit/compile/inference/findAllFiles.spec.ts +++ b/test/unit/compile/inference/findAllFiles.spec.ts @@ -1,7 +1,7 @@ import expect from "expect"; import fse from "fs-extra"; import { join } from "path"; -import { FileMap, FileSystemResolver, findAllFiles, fromUTF8 } from "../../../../src"; +import { FileMap, FileSystemResolver, findAllFiles, stringToBytes } from "../../../../src"; const SAMPLES_DIR = join("test", "samples", "solidity"); @@ -63,7 +63,7 @@ describe("findAllFiles() throws proper errors", () => { const files: FileMap = new Map([ [ "foo.sol", - fromUTF8(`import a + stringToBytes(`import a contract Foo { } `) @@ -79,7 +79,7 @@ contract Foo { const files: FileMap = new Map([ [ "foo.sol", - fromUTF8(`import "a.sol"; + stringToBytes(`import "a.sol"; contract Foo { } `) diff --git a/test/unit/compile/utils.spec.ts b/test/unit/compile/utils.spec.ts index 2216f93d..6195f22f 100644 --- a/test/unit/compile/utils.spec.ts +++ b/test/unit/compile/utils.spec.ts @@ -5,12 +5,12 @@ import { CompilerKind, detectCompileErrors, FileMap, - fromUTF8, getCompilerForVersion, LatestAndFirstVersionInEachSeriesStrategy, LatestCompilerVersion, NativeCompiler, parsePathRemapping, + stringToBytes, WasmCompiler } from "../../../src"; @@ -102,9 +102,12 @@ describe("Compile general utils", () => { const expectedFiles: FileMap = new Map([ [ "./test/sol_files/json_code/B.sol", - fromUTF8("import './A.sol';\n\ncontract B {\n int16 test;\n}\n") + stringToBytes("import './A.sol';\n\ncontract B {\n int16 test;\n}\n") ], - ["./test/sol_files/json_code/A.sol", fromUTF8("contract A {\n uint8 test;\n}\n")] + [ + 
"./test/sol_files/json_code/A.sol", + stringToBytes("contract A {\n uint8 test;\n}\n") + ] ]); const cases: Array<[string, string | undefined, RegExp | undefined]> = [