feat[tool]: add node_id map to source map (#3811)

This commit adds a new, AST-based map to the source map which links program counters (pcs) directly back to the AST output. This should improve the ability of third parties to implement source code integrations (debuggers, storage map tracers, etc.).

refactors:
- get rid of `vyper.codegen.core.getpos()`
- rename `IRnode.source_pos` to `IRnode.ast_source`
- refactor a couple of places in codegen which were passing `IRnode`s to the `Expr` constructor
- rewrote the source map compression routine a bit. it might have gotten broken, but at this point the compressed source map does not seem widely used.
vyperlang · Mar 12, 2024 · 246f4a7 · 246f4a7
1 parent 39027dc
commit 246f4a7
Show file tree

Hide file tree

Showing 17 changed files with 239 additions and 171 deletions.
diff --git a/docs/compiling-a-contract.rst b/docs/compiling-a-contract.rst
@@ -275,11 +275,14 @@ The following example describes the expected input format of ``vyper-json``. Com
             //    evm.bytecode.opcodes - Opcodes list
             //    evm.deployedBytecode.object - Deployed bytecode object
             //    evm.deployedBytecode.opcodes - Deployed opcodes list
-            //    evm.deployedBytecode.sourceMap - Deployed source mapping (useful for debugging)
+            //    evm.deployedBytecode.sourceMap - Solidity-style source mapping
+            //    evm.deployedBytecode.sourceMapFull - Deployed source mapping (useful for debugging)
             //    evm.methodIdentifiers - The list of function hashes
             //
             // Using `evm`, `evm.bytecode`, etc. will select every target part of that output.
             // Additionally, `*` can be used as a wildcard to request everything.
+            // Note that the sourceMapFull.pc_ast_map is the recommended source map to use;
+            // the other types are included for legacy and compatibility reasons.
             //
             "outputSelection": {
                 "*": ["evm.bytecode", "abi"],  // Enable the abi and bytecode outputs for every single contract

diff --git a/tests/unit/cli/vyper_json/test_compile_json.py b/tests/unit/cli/vyper_json/test_compile_json.py
@@ -151,7 +151,11 @@ def test_compile_json(input_json, input_bundle):
     for source_id, contract_name in [(0, "foo"), (2, "library"), (3, "bar")]:
         path = f"contracts/{contract_name}.vy"
         data = compile_code_results[path]
-        assert output_json["sources"][path] == {"id": source_id, "ast": data["ast_dict"]["ast"]}
+        assert output_json["sources"][path] == {
+            "id": source_id,
+            "ast": data["ast_dict"]["ast"],
+            "annotated_ast": data["annotated_ast_dict"]["ast"],
+        }
         assert output_json["contracts"][path][contract_name] == {
             "abi": data["abi"],
             "devdoc": data["devdoc"],
@@ -260,15 +264,25 @@ def test_exc_handler_to_dict_compiler(input_json):
 
 
 def test_source_ids_increment(input_json):
-    input_json["settings"]["outputSelection"] = {"*": ["evm.deployedBytecode.sourceMap"]}
+    input_json["settings"]["outputSelection"] = {"*": ["ast", "evm.deployedBytecode.sourceMapFull"]}
     result = compile_json(input_json)
 
     def get(filename, contractname):
-        return result["contracts"][filename][contractname]["evm"]["deployedBytecode"]["sourceMap"]
+        ast = result["sources"][filename]["ast"]
+        ret = ast["source_id"]
+
+        # grab it via source map to sanity check
+        contract_info = result["contracts"][filename][contractname]["evm"]
+        pc_ast_map = contract_info["deployedBytecode"]["sourceMapFull"]["pc_ast_map"]
+        pc_item = next(iter(pc_ast_map.values()))
+        source_id, node_id = pc_item
+        assert ret == source_id
+
+        return ret
 
-    assert get("contracts/foo.vy", "foo").startswith("-1:-1:0")
-    assert get("contracts/library.vy", "library").startswith("-1:-1:2")
-    assert get("contracts/bar.vy", "bar").startswith("-1:-1:3")
+    assert get("contracts/foo.vy", "foo") == 0
+    assert get("contracts/library.vy", "library") == 2
+    assert get("contracts/bar.vy", "bar") == 3
 
 
 def test_relative_import_paths(input_json):

diff --git a/tests/unit/cli/vyper_json/test_output_selection.py b/tests/unit/cli/vyper_json/test_output_selection.py
@@ -45,6 +45,16 @@ def test_star():
     assert result == {PurePath("foo.vy"): expected, PurePath("bar.vy"): expected}
 
 
+def test_ast():
+    input_json = {
+        "sources": {"foo.vy": ""},
+        "settings": {"outputSelection": {"foo.vy": ["ast", "annotated_ast"]}},
+    }
+    expected = sorted([TRANSLATE_MAP[k] for k in ["ast", "annotated_ast"]])
+    result = get_output_formats(input_json)
+    assert result == {PurePath("foo.vy"): expected}
+
+
 def test_evm():
     input_json = {
         "sources": {"foo.vy": ""},

diff --git a/tests/unit/compiler/test_source_map.py b/tests/unit/compiler/test_source_map.py
@@ -1,14 +1,18 @@
+from collections import namedtuple
+
 from vyper.compiler import compile_code
 from vyper.compiler.output import _compress_source_map
 from vyper.compiler.utils import expand_source_map
 
 TEST_CODE = """
+x: public(uint256)
+
 @internal
 def _baz(a: int128) -> int128:
     b: int128 = a
     for i: int128 in range(2, 5):
         b *=  i
-        if b > 31337:
+        if b > 31336 + 1:
             break
     return b
 
@@ -82,26 +86,58 @@ def update_foo():
 
 
 def test_compress_source_map():
-    code = """
-@external
-def foo() -> uint256:
-    return 42
-    """
+    # mock the required VyperNode fields in compress_source_map
+    # fake_node = namedtuple("fake_node", ("lineno", "col_offset", "end_lineno", "end_col_offset"))
+    fake_node = namedtuple("fake_node", ["src"])
+
     compressed = _compress_source_map(
-        code, {"0": None, "2": (2, 0, 4, 13), "3": (2, 0, 2, 8), "5": (2, 0, 2, 8)}, {"3": "o"}, 2
+        {2: fake_node("-1:-1:-1"), 3: fake_node("1:45"), 5: fake_node("45:49")}, {3: "o"}, 6
     )
-    assert compressed == "-1:-1:2:-;1:45;:8::o;"
+    assert compressed == "-1:-1:-1;-1:-1:-1;-1:-1:-1;1:45:o;-1:-1:-1;45:49"
 
 
 def test_expand_source_map():
-    compressed = "-1:-1:0:-;;13:42:1;:21;::0:o;:::-;1::1;"
+    compressed = "13:42:1;:21;::0:o;:::-;1::1;"
     expanded = [
-        [-1, -1, 0, "-"],
-        [-1, -1, 0, None],
         [13, 42, 1, None],
         [13, 21, 1, None],
         [13, 21, 0, "o"],
         [13, 21, 0, "-"],
         [1, 21, 1, None],
     ]
     assert expand_source_map(compressed) == expanded
+
+
+def _construct_node_id_map(ast_struct):
+    if isinstance(ast_struct, dict):
+        ret = {}
+        if "node_id" in ast_struct:
+            ret[ast_struct["node_id"]] = ast_struct
+        for item in ast_struct.values():
+            ret.update(_construct_node_id_map(item))
+        return ret
+
+    elif isinstance(ast_struct, list):
+        ret = {}
+        for item in ast_struct:
+            ret.update(_construct_node_id_map(item))
+        return ret
+
+    else:
+        return {}
+
+
+def test_node_id_map():
+    code = TEST_CODE
+    out = compile_code(code, output_formats=["annotated_ast_dict", "source_map", "ir"])
+    assert out["source_map"]["pc_ast_map_item_keys"] == ("source_id", "node_id")
+
+    pc_ast_map = out["source_map"]["pc_ast_map"]
+
+    ast_node_map = _construct_node_id_map(out["annotated_ast_dict"])
+
+    for pc, (source_id, node_id) in pc_ast_map.items():
+        assert isinstance(pc, int), pc
+        assert isinstance(source_id, int), source_id
+        assert isinstance(node_id, int), node_id
+        assert node_id in ast_node_map
diff --git a/vyper/ast/nodes.py b/vyper/ast/nodes.py
@@ -146,7 +146,7 @@ def _to_node(obj, parent):
     if isinstance(obj, VyperNode):
         # if object is already a vyper node, make sure the parent is set correctly
         # and fix any missing source offsets
-        obj._parent = parent
+        obj.set_parent(parent)
         for field_name in NODE_SRC_ATTRIBUTES:
             if getattr(obj, field_name) is None:
                 setattr(obj, field_name, getattr(parent, field_name, None))

diff --git a/vyper/ast/nodes.pyi b/vyper/ast/nodes.pyi
@@ -17,6 +17,10 @@ def get_node(
 class VyperNode:
     full_source_code: str = ...
     node_source_code: str = ...
+    lineno: int = ...
+    col_offset: int = ...
+    end_lineno: int = ...
+    end_col_offset: int = ...
     _metadata: dict = ...
     _original_node: Optional[VyperNode] = ...
     def __init__(self, parent: Optional[VyperNode] = ..., **kwargs: Any) -> None: ...

diff --git a/vyper/builtins/_utils.py b/vyper/builtins/_utils.py
@@ -7,10 +7,10 @@
 from vyper.semantics.types.module import ModuleT
 
 
-def _strip_source_pos(ir_node):
-    ir_node.source_pos = None
+def _strip_ast_source(ir_node):
+    ir_node.ast_source = None
     for x in ir_node.args:
-        _strip_source_pos(x)
+        _strip_ast_source(x)
 
 
 def generate_inline_function(code, variables, variables_2, memory_allocator):
@@ -38,5 +38,5 @@ def generate_inline_function(code, variables, variables_2, memory_allocator):
     # NOTE if we ever use this for inlining user-code, it would make
     # sense to fix the offsets of the source positions in the generated
     # code instead of stripping them.
-    _strip_source_pos(generated_ir)
+    _strip_ast_source(generated_ir)
     return new_context, generated_ir
diff --git a/vyper/cli/vyper_json.py b/vyper/cli/vyper_json.py
@@ -17,6 +17,7 @@
 TRANSLATE_MAP = {
     "abi": "abi",
     "ast": "ast_dict",
+    "annotated_ast": "annotated_ast_dict",
     "devdoc": "devdoc",
     "evm.methodIdentifiers": "method_identifiers",
     "evm.bytecode.object": "bytecode",
@@ -313,8 +314,12 @@ def format_to_output_dict(compiler_data: dict) -> dict:
     for path, data in compiler_data.items():
         path = str(path)  # Path breaks json serializability
         output_dict["sources"][path] = {"id": data["source_id"]}
-        if "ast_dict" in data:
-            output_dict["sources"][path]["ast"] = data["ast_dict"]["ast"]
+
+        for k in ("ast_dict", "annotated_ast_dict"):
+            if k in data:
+                # un-translate the key
+                k2 = k.removesuffix("_dict")
+                output_dict["sources"][path][k2] = data[k]["ast"]
 
         name = PurePath(path).stem
         output_dict["contracts"][path] = {name: {}}

diff --git a/vyper/codegen/core.py b/vyper/codegen/core.py
@@ -432,15 +432,6 @@ def pop_dyn_array(darray_node, return_popped_item):
             return IRnode.from_list(b1.resolve(b2.resolve(ret)), typ=typ, location=location)
 
 
-def getpos(node):
-    return (
-        node.lineno,
-        node.col_offset,
-        getattr(node, "end_lineno", None),
-        getattr(node, "end_col_offset", None),
-    )
-
-
 # add an offset to a pointer, keeping location and encoding info
 def add_ofst(ptr, ofst):
     ret = ["add", ptr, ofst]

diff --git a/vyper/codegen/expr.py b/vyper/codegen/expr.py
@@ -13,7 +13,6 @@
     ensure_in_memory,
     get_dyn_array_count,
     get_element_ptr,
-    getpos,
     is_array_like,
     is_bytes_m_type,
     is_flag_type,
@@ -72,13 +71,6 @@ class Expr:
     # TODO: Once other refactors are made reevaluate all inline imports
 
     def __init__(self, node, context, is_stmt=False):
-        if isinstance(node, IRnode):
-            # this is a kludge for parse_AugAssign to pass in IRnodes
-            # directly.
-            # TODO fixme!
-            self.ir_node = node
-            return
-
         assert isinstance(node, vy_ast.VyperNode)
         if node.has_folded_value:
             node = node.get_folded_value()
@@ -94,7 +86,7 @@ def __init__(self, node, context, is_stmt=False):
             assert isinstance(self.ir_node, IRnode), self.ir_node
 
         self.ir_node.annotation = self.expr.get("node_source_code")
-        self.ir_node.source_pos = getpos(self.expr)
+        self.ir_node.ast_source = self.expr
 
     def parse_Int(self):
         typ = self.expr._metadata["type"]
@@ -382,7 +374,14 @@ def parse_BinOp(self):
         left = Expr.parse_value_expr(self.expr.left, self.context)
         right = Expr.parse_value_expr(self.expr.right, self.context)
 
-        is_shift_op = isinstance(self.expr.op, (vy_ast.LShift, vy_ast.RShift))
+        return Expr.handle_binop(self.expr.op, left, right, self.context)
+
+    @classmethod
+    def handle_binop(cls, op, left, right, context):
+        assert not left.is_pointer
+        assert not right.is_pointer
+
+        is_shift_op = isinstance(op, (vy_ast.LShift, vy_ast.RShift))
 
         if is_shift_op:
             assert is_numeric_type(left.typ)
@@ -391,25 +390,25 @@ def parse_BinOp(self):
             # Sanity check - ensure that we aren't dealing with different types
             # This should be unreachable due to the type check pass
             if left.typ != right.typ:
-                raise TypeCheckFailure(f"unreachable, {left.typ} != {right.typ}", self.expr)
+                raise TypeCheckFailure(f"unreachable: {left.typ} != {right.typ}")
             assert is_numeric_type(left.typ) or is_flag_type(left.typ)
 
         out_typ = left.typ
 
-        if isinstance(self.expr.op, vy_ast.BitAnd):
+        if isinstance(op, vy_ast.BitAnd):
             return IRnode.from_list(["and", left, right], typ=out_typ)
-        if isinstance(self.expr.op, vy_ast.BitOr):
+        if isinstance(op, vy_ast.BitOr):
             return IRnode.from_list(["or", left, right], typ=out_typ)
-        if isinstance(self.expr.op, vy_ast.BitXor):
+        if isinstance(op, vy_ast.BitXor):
             return IRnode.from_list(["xor", left, right], typ=out_typ)
 
-        if isinstance(self.expr.op, vy_ast.LShift):
+        if isinstance(op, vy_ast.LShift):
             new_typ = left.typ
             if new_typ.bits != 256:
                 # TODO implement me. ["and", 2**bits - 1, shl(right, left)]
                 raise TypeCheckFailure("unreachable")
             return IRnode.from_list(shl(right, left), typ=new_typ)
-        if isinstance(self.expr.op, vy_ast.RShift):
+        if isinstance(op, vy_ast.RShift):
             new_typ = left.typ
             if new_typ.bits != 256:
                 # TODO implement me. promote_signed_int(op(right, left), bits)
@@ -421,17 +420,17 @@ def parse_BinOp(self):
         assert is_numeric_type(left.typ)
 
         with left.cache_when_complex("x") as (b1, x), right.cache_when_complex("y") as (b2, y):
-            if isinstance(self.expr.op, vy_ast.Add):
+            if isinstance(op, vy_ast.Add):
                 ret = arithmetic.safe_add(x, y)
-            elif isinstance(self.expr.op, vy_ast.Sub):
+            elif isinstance(op, vy_ast.Sub):
                 ret = arithmetic.safe_sub(x, y)
-            elif isinstance(self.expr.op, vy_ast.Mult):
+            elif isinstance(op, vy_ast.Mult):
                 ret = arithmetic.safe_mul(x, y)
-            elif isinstance(self.expr.op, (vy_ast.Div, vy_ast.FloorDiv)):
+            elif isinstance(op, (vy_ast.Div, vy_ast.FloorDiv)):
                 ret = arithmetic.safe_div(x, y)
-            elif isinstance(self.expr.op, vy_ast.Mod):
+            elif isinstance(op, vy_ast.Mod):
                 ret = arithmetic.safe_mod(x, y)
-            elif isinstance(self.expr.op, vy_ast.Pow):
+            elif isinstance(op, vy_ast.Pow):
                 ret = arithmetic.safe_pow(x, y)
             else:  # pragma: nocover
                 raise CompilerPanic("Unreachable")

diff --git a/vyper/codegen/function_definitions/external_function.py b/vyper/codegen/function_definitions/external_function.py
@@ -1,6 +1,6 @@
 from vyper.codegen.abi_encoder import abi_encoding_matches_vyper
 from vyper.codegen.context import Context, VariableRecord
-from vyper.codegen.core import get_element_ptr, getpos, make_setter, needs_clamp
+from vyper.codegen.core import get_element_ptr, make_setter, needs_clamp
 from vyper.codegen.expr import Expr
 from vyper.codegen.function_definitions.common import (
     EntryPointInfo,
@@ -39,7 +39,7 @@ def _register_function_args(func_t: ContractFunctionT, context: Context) -> list
             dst = IRnode(p, typ=arg.typ, location=MEMORY)
 
             copy_arg = make_setter(dst, arg_ir)
-            copy_arg.source_pos = getpos(arg.ast_source)
+            copy_arg.ast_source = arg.ast_source
             ret.append(copy_arg)
         else:
             assert abi_encoding_matches_vyper(arg.typ)
@@ -101,18 +101,18 @@ def handler_for(calldata_kwargs, default_kwargs):
             rhs = get_element_ptr(calldata_kwargs_ofst, k, array_bounds_check=False)
 
             copy_arg = make_setter(lhs, rhs)
-            copy_arg.source_pos = getpos(arg_meta.ast_source)
+            copy_arg.ast_source = arg_meta.ast_source
             ret.append(copy_arg)
 
         for x in default_kwargs:
             dst = context.lookup_var(x.name).pos
             lhs = IRnode(dst, location=MEMORY, typ=x.typ)
-            lhs.source_pos = getpos(x.ast_source)
+            lhs.ast_source = x.ast_source
             kw_ast_val = func_t.default_values[x.name]  # e.g. `3` in x: int = 3
             rhs = Expr(kw_ast_val, context).ir_node
 
             copy_arg = make_setter(lhs, rhs)
-            copy_arg.source_pos = getpos(x.ast_source)
+            copy_arg.ast_source = x.ast_source
             ret.append(copy_arg)
 
         ret.append(["goto", func_t._ir_info.external_function_base_entry_label])
@@ -210,7 +210,7 @@ def generate_ir_for_external_function(code, compilation_target):
 
     # the ir which comprises the main body of the function,
     # besides any kwarg handling
-    func_common_ir = IRnode.from_list(["seq", body, exit_], source_pos=getpos(code))
+    func_common_ir = IRnode.from_list(["seq", body, exit_], ast_source=code)
 
     tag_frame_info(func_t, context)