Switch sources to typed arrays instead of strings (#242)

* Store source files as Uint8Arrays instead of strings internally
* Fix ASTNode.extractSourceFragment() to use typed array instead of string
Consensys · Jan 11, 2024 · c11d592 · c11d592
1 parent 9ebcad8
commit c11d592
Show file tree

Hide file tree

Showing 41 changed files with 986 additions and 217 deletions.
diff --git a/.eslintrc.json b/.eslintrc.json
@@ -30,6 +30,8 @@
     "ignorePatterns": [
         "src/compile/inference/file_level_definitions_parser_header.ts",
         "src/compile/inference/file_level_definitions_parser.ts",
+        "src/ast/comments/comments_parser_header.ts",
+        "src/ast/comments/comments_parser.ts",
         "test/utils/typeStrings/typeString_parser_header.ts",
         "test/utils/typeStrings/typeString_parser.ts"
     ]

diff --git a/.gitignore b/.gitignore
@@ -6,5 +6,6 @@ docs
 coverage
 *.tgz
 src/compile/inference/file_level_definitions_parser.ts
+src/ast/comments/comments_parser.ts
 test/utils/typeStrings/typeString_parser.ts
 .idea
diff --git a/.nycrc.json b/.nycrc.json
@@ -7,7 +7,8 @@
         "**/coverage/**",
         "**/docs/**",
         "**/.compiler_cache/**",
-        "src/compile/inference/file_level_definitions_parser*.ts"
+        "src/compile/inference/file_level_definitions_parser*.ts",
+        "src/ast/comments/comments_parser*.ts"
     ],
     "reporter": ["lcov", "text-summary"],
     "all": true,

diff --git a/package.json b/package.json
@@ -16,7 +16,8 @@
         "transpile": "tsc",
         "build-type-parser": "tspegjs -o test/utils/typeStrings/typeString_parser.ts --custom-header-file test/utils/typeStrings/typeString_parser_header.ts --cache test/utils/typeStrings/typeString_grammar.pegjs",
         "build-file-level-definitions-parser": "tspegjs -o src/compile/inference/file_level_definitions_parser.ts --custom-header-file src/compile/inference/file_level_definitions_parser_header.ts --cache src/compile/inference/file_level_definitions.pegjs",
-        "build": "npm run clean && npm run build-file-level-definitions-parser && npm run transpile && chmod u+x dist/bin/compile.js",
+        "build-comments-parser": "tspegjs -o src/ast/comments/comments_parser.ts --custom-header-file src/ast/comments/comments_parser_header.ts --cache src/ast/comments/comments_grammar.pegjs",
+        "build": "npm run clean && npm run build-comments-parser && npm run build-file-level-definitions-parser && npm run transpile && chmod u+x dist/bin/compile.js",
         "lint": "eslint src/ test/ --ext=ts",
         "lint:fix": "eslint src/ test/ --ext=ts --fix",
         "test": "npm run build-type-parser && NODE_OPTIONS='--max-old-space-size=2048' nyc mocha",

diff --git a/src/ast/ast_node.ts b/src/ast/ast_node.ts
@@ -1,6 +1,6 @@
 import { ASTNodeFormatter } from "./ast_node_formatter";
 import { ASTContext } from "./ast_reader";
-import { parseSourceLocation, SourceLocation } from "./utils";
+import { SourceLocation, parseSourceLocation } from "./utils";
 
 export type ASTNodeCallback = (node: ASTNode) => void;
 export type ASTNodeSelector = (node: ASTNode) => boolean;
@@ -328,10 +328,10 @@ export class ASTNode {
      *
      * In other words, returns corresponding code fragment substring.
      */
-    extractSourceFragment(source: string): string {
+    extractSourceFragment(source: Uint8Array): Uint8Array {
         const { offset, length } = this.sourceInfo;
 
-        return source.substr(offset, length);
+        return source.slice(offset, offset + length);
     }
 
     private createWalker(callback: ASTNodeCallback): ASTNodeCallback {

diff --git a/src/ast/ast_reader.ts b/src/ast/ast_reader.ts
@@ -5,6 +5,10 @@ import { ModernConfiguration } from "./modern";
 import { DefaultNodePostprocessorList } from "./postprocessing";
 import { sequence } from "./utils";
 
+// We store source files as byte arrays since AST src maps are byte-offset
+// based.
+export type FileMap = Map<string, Uint8Array>;
+
 export interface ASTNodeProcessor<T extends ASTNode> {
     process(
         reader: ASTReader,
@@ -14,7 +18,7 @@ export interface ASTNodeProcessor<T extends ASTNode> {
 }
 
 export interface ASTNodePostprocessor<T extends ASTNode> {
-    process(node: T, context: ASTContext, sources?: Map<string, string>): void;
+    process(node: T, context: ASTContext, sources?: FileMap): void;
     isSupportedNode(node: ASTNode): node is T;
 }
 
@@ -133,15 +137,15 @@ export class ASTPostprocessor {
         );
     }
 
-    processNode(node: ASTNode, context: ASTContext, sources?: Map<string, string>): void {
+    processNode(node: ASTNode, context: ASTContext, sources?: FileMap): void {
         const postprocessors = this.getPostprocessorsForNode(node);
 
         for (const postprocessor of postprocessors) {
             postprocessor.process(node, context, sources);
         }
     }
 
-    processContext(context: ASTContext, sources?: Map<string, string>): void {
+    processContext(context: ASTContext, sources?: FileMap): void {
         for (const postprocessor of this.nodePostprocessors) {
             for (const node of context.nodes) {
                 if (postprocessor.isSupportedNode(node)) {
@@ -185,7 +189,7 @@ export class ASTReader {
      *
      * @returns An array of `SourceUnit`s for each of the source entries in the input.
      */
-    read(data: any, kind = ASTKind.Any, sources?: Map<string, string>): SourceUnit[] {
+    read(data: any, kind = ASTKind.Any, sources?: FileMap): SourceUnit[] {
         const entries: Array<[string, any]> = Object.entries(data.sources);
         const rootNodeTypeName = "SourceUnit";
         const result: SourceUnit[] = [];

diff --git a/src/ast/comments/comment.ts b/src/ast/comments/comment.ts
@@ -0,0 +1,35 @@
+import { RawCommentKind } from "../constants";
+
+export interface CommentLoc { // span of a comment within the text handed to the comments parser
+    start: number; // offset where the comment begins (filled from the parser's location().start.offset)
+    end: number; // offset where the comment ends (from location().end.offset; presumably exclusive - confirm)
+}
+
+export class RawComment { // one comment recognised by the comments grammar
+    /**
+     * Kind of comment (see RawCommentKind)
+     */
+    kind: RawCommentKind;
+
+    /**
+     * The entire text of the comment, including the *s and /s
+     */
+    text: string;
+
+    /**
+     * The text of the comment without the * and / markers, i.e. only the actual comment body
+     */
+    internalText: string;
+
+    /**
+     * The location of this comment
+     */
+    loc: CommentLoc;
+
+    constructor(kind: RawCommentKind, text: string, internalText: string, loc: CommentLoc) {
+        this.kind = kind;
+        this.text = text;
+        this.internalText = internalText;
+        this.loc = loc;
+    }
+}
diff --git a/src/ast/comments/comments_grammar.pegjs b/src/ast/comments/comments_grammar.pegjs
@@ -0,0 +1,168 @@
+{ // bare references to parser internals; presumably here so the generated TS does not flag them as unused - TODO confirm
+    expected;
+    error;
+    peg$anyExpectation;
+    peg$parse__;
+}
+
+CommentSoup = // first rule of the grammar: yields raw text chunks (strings) and RawComment objects in input order
+    t: (
+        ([^"'/]+ (!("//" / "///" / "/*") "/")?) { return text(); } // run of characters other than quotes and slashes, optionally ending in one slash that does not begin a comment
+        / StringLiteral { return text(); } // string literal, kept as raw text (quotes included) so comment markers inside it are not parsed as comments
+        / (c: Comment __ { return c; }) // a comment; white space and line terminators following it are consumed but not returned
+    )* { return t; }
+
+Comment
+    = BlockComment
+    / NatspecLineGroup
+    / LineComment
+
+
+FirstBlockLine = "/*" body: ((!"*/" NonLineTerminator)* { return text(); }) LineTerminator { return body; } // opening line: returns the text after the opener up to the line terminator
+BlockLine = (PrimitiveWhiteSpace* (!"*/" "*"))? body: ((!"*/" NonLineTerminator)* { return text(); }) LineTerminator { return body; } // middle line: optional leading white space plus one star are skipped, the rest of the line is returned
+LastBlockLine = (PrimitiveWhiteSpace* (!"*/" "*"))? body: ((!"*/" NonLineTerminator)* { return text(); }) "*/" { return body; } // closing line: same skipping, returns the text up to the closing delimiter
+
+MultiLineBlockComment = start: FirstBlockLine inner: BlockLine* last: LastBlockLine {
+    const isNatSpec = start[0] === "*"; // start is the text after the opener, so a leading "*" means the comment opened with two stars
+
+    // For NatSpec comments we strip 1 leading space (if present) from each inner line
+    // and from the last line, to be compatible with the Solidity compiler's behavior
+    if (isNatSpec) {
+        inner = inner.map((l: string) => l.startsWith(" ") ? l.slice(1) : l);
+        last = last.startsWith(" ") ? last.slice(1) : last;
+    }
+
+    let body = [start, ...inner, last].join("\n") // lines are re-joined with "\n" whichever line terminator the input used
+
+    // for NatSpec, drop the first character of the body, i.e. the second star of the opener
+    body = isNatSpec ? body.slice(1) : body;
+
+    const kind = isNatSpec ? RawCommentKind.BlockNatSpec : RawCommentKind.BlockComment;
+
+    return new RawComment(kind, text(), body, mkLoc(location()) )
+}
+
+SingleLineBlockComment = "/*" body: ((!"*/" NonLineTerminator)* { return text(); }) "*/" {
+    const isNatSpec = body[0] === "*"; // body is the text between the delimiters; a leading "*" marks NatSpec
+    return new RawComment(
+        isNatSpec ? RawCommentKind.BlockNatSpec : RawCommentKind.BlockComment,
+        text(),
+        isNatSpec ? body.slice(1) : body, // for NatSpec, drop the leading star (the second star of the opener)
+        mkLoc(location())
+    );
+}
+
+BlockComment = MultiLineBlockComment / SingleLineBlockComment
+
+NonLineTerminator =
+    [^\n\r\u2028\u2029]
+
+LineComment = // two-slash comment; body is everything after the slashes up to the line terminator
+    "//" body: (NonLineTerminator* { return text(); }) LineTerminator { // NOTE(review): LineTerminator is mandatory, so a comment on a final line with no trailing newline does not match - confirm intended
+        return new RawComment(RawCommentKind.SingleLineComment, text(), body, mkLoc(location()));
+    }
+
+LineNatspec = // one three-slash line (leading white space allowed); returns its body as a string, not a RawComment
+    PrimitiveWhiteSpace* "///" body: (NonLineTerminator* { return text(); }) LineTerminator { // NOTE(review): LineTerminator is mandatory here too, so a last line without a newline does not match - confirm intended
+        return body.startsWith(" ") ? body.slice(1) : body; // strip at most one leading space from the body
+    }
+
+NatspecLineGroup = // one or more consecutive LineNatspec lines folded into a single RawComment
+    bodies: LineNatspec+ {
+        return new RawComment(RawCommentKind.LineGroupNatSpec, text(), bodies.join("\n"), mkLoc(location())); // text() spans the whole group; the per-line bodies are joined with "\n"
+    }
+
+// ==== White space
+
+PrimitiveWhiteSpace = // a single white space character; line terminators are matched by LineTerminator instead
+    "\t"
+    / "\v"
+    / "\f"
+    / " "
+    / "\u00A0"
+    / "\uFEFF"
+    / Zs // the Zs class also lists \u0020 and \u00A0, so the " " and "\u00A0" alternatives above overlap with it
+
+// Separator, Space
+Zs =
+    [\u0020\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]
+
+LineTerminator =
+    [\n\r\u2028\u2029]
+
+__ =
+    (PrimitiveWhiteSpace / LineTerminator)*
+
+StringLiteral = // single- or double-quoted string; yields the content with escapes decoded (CommentSoup discards this value and keeps the raw text)
+    "'" chars: SingleStringChar* "'" { return chars.join(""); }
+    / '"' chars: DoubleStringChar* '"' { return chars.join(""); }
+
+AnyChar =
+    .
+
+DoubleStringChar =
+    !('"' / "\\" / LineTerminator) AnyChar { return text(); }
+    / "\\" sequence: EscapeSequence { return sequence; }
+    / LineContinuation
+
+SingleStringChar =
+    !("'" / "\\" / LineTerminator) AnyChar { return text(); }
+    / "\\" sequence: EscapeSequence { return sequence; }
+    / LineContinuation
+
+LineContinuation =
+    "\\" LineTerminatorSequence { return ""; }
+
+EscapeSequence = // what may follow a backslash inside a string literal; alternatives are tried in order
+    CharEscapeSequence
+    / "0" !DecDigit { return "\0"; } // NUL escape, only when no decimal digit follows the 0
+    / HexEscapeSequence
+    / UnicodeEscapeSequence
+    / AnyChar // Allow invalid hex sequences as a fallback
+
+CharEscapeSequence =
+    SingleEscapeChar
+    / NonEscapeChar
+
+SingleEscapeChar =
+    "'"
+    / '"'
+    / "\\"
+    / "b"  { return "\b"; }
+    / "f"  { return "\f"; }
+    / "n"  { return "\n"; }
+    / "r"  { return "\r"; }
+    / "t"  { return "\t"; }
+    / "v"  { return "\v"; }
+
+NonEscapeChar =
+    !(EscapeChar / LineTerminator) AnyChar { return text(); }
+
+HexDigit =
+    [0-9a-f]i
+
+DecDigit =
+    [0-9]
+
+EscapeChar =
+    SingleEscapeChar
+    / DecDigit
+    / "x"
+    / "u"
+
+HexEscapeSequence =
+    "x" digits:$(HexDigit HexDigit) {
+        return String.fromCharCode(parseInt(digits, 16));
+    }
+
+UnicodeEscapeSequence =
+    "u" digits:$(HexDigit HexDigit HexDigit HexDigit) {
+        return String.fromCharCode(parseInt(digits, 16));
+    }
+
+LineTerminatorSequence =
+    "\n"
+    / "\r\n"
+    / "\r"
+    / "\u2028"
+    / "\u2029"
diff --git a/src/ast/comments/comments_parser_header.ts b/src/ast/comments/comments_parser_header.ts
@@ -0,0 +1,11 @@
+import { CommentLoc, RawComment } from "./comment";
+import { RawCommentKind } from "../constants";
+
+function mkLoc(raw: any): CommentLoc { // raw: the value the grammar actions obtain from location()
+    return { start: raw.start.offset, end: raw.end.offset }; // keep only the start and end offsets
+}
+
+export function parseComments(contents: string): (RawComment | string)[] { // a RawComment per comment, a plain string for every other matched chunk
+    // @ts-ignore parse() is not declared in this header; presumably supplied by the tspegjs-generated parser built with this file as its custom header - confirm
+    return parse(contents);
+}
diff --git a/src/ast/comments/index.ts b/src/ast/comments/index.ts
@@ -0,0 +1,2 @@
+export * from "./comment";
+export { parseComments } from "./comments_parser";
diff --git a/src/ast/constants.ts b/src/ast/constants.ts
@@ -89,6 +89,13 @@ export enum TimeUnit {
     Years = "years"
 }
 
+export enum RawCommentKind {
+    SingleLineComment = "single_line", // a two-slash line comment
+    BlockComment = "block_comment", // a block comment whose body does not start with a star
+    LineGroupNatSpec = "line_group_natspec", // one or more consecutive three-slash lines grouped together
+    BlockNatSpec = "block_natspec" // a block comment whose body starts with a star (opened with two stars)
+}
+
 export const PossibleDataLocations = new Set<string>(Object.values(DataLocation));
 
 export const PossibleFunctionVisibilities = new Set<string>(Object.values(FunctionVisibility));

diff --git a/src/ast/index.ts b/src/ast/index.ts
@@ -15,3 +15,4 @@ export * from "./dispatch";
 export * from "./definitions";
 export * from "./utils";
 export * from "./xpath";
+export * from "./comments";
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		export * from "./comment";
		export { parseComments } from "./comments_parser";