diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bb871d2143..161467b15a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,6 +5,7 @@ on: branches: [main] pull_request: branches: [main] + workflow_dispatch: jobs: build: diff --git a/.github/workflows/cli-test.yml b/.github/workflows/cli-test.yml new file mode 100644 index 0000000000..93ffa5d67d --- /dev/null +++ b/.github/workflows/cli-test.yml @@ -0,0 +1,23 @@ +name: build + +on: + workflow_dispatch: +jobs: + env: + GPTOOLS_TOKEN: ${{ secrets.GPTOOLS_TOKEN }} + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: "recursive" + fetch-depth: 0 + - uses: actions/setup-node@v3 + with: + node-version: "18" + cache: yarn + - run: yarn install --frozen-lockfile + - name: compile + run: yarn compile + - name: run gptools + run: node packages/cli/built/gptools.js front-matter README.md -ot $GITHUB_STEP_SUMMARY diff --git a/.gptools/cache/openai.gptools.jsonl b/.gptools/cache/openai.gptools.jsonl index a59a0640e0..62b2d37b9f 100644 --- a/.gptools/cache/openai.gptools.jsonl +++ b/.gptools/cache/openai.gptools.jsonl @@ -2,3 +2,4 @@ {"sha":"51f1eef52de4e2b4d31e48f07b8282711f1ea367c7d75abae255e29fb57dcca3","key":{"model":"gpt-4-32k","temperature":0.3,"max_tokens":10000,"messages":[{"role":"system","content":"You are concise. Answer in markdown.\n\n\nWhen explaining answers, take a deep breath.\n\n\nWhen generating or updating files you will use the following syntax:\n\nFile ./file1.ts:\n```\nWhat goes in\n./file1.ts.\n```\n\nFile ./file1.py:\n```\nWhat goes in\n./file1.py.\n```\n\nFile /path_to_file/file2.md:\n`````\nWhat goes in\n/path_to_file/file2.md.\n`````\n\nMake sure to use precisely ``` to guard file code sections.\n\nMake sure to use precisely ````` to guard file markdown sections.\n\nUse full path of filename in code section header.\n\nIf a file does not have changes, do not regenerate.\n\n\nAdditionally, generate a concise SUMMARY of the changes in all files using the following syntax:\n\nSUMMARY:\n```\nThis is a summary of the changes in files.\n```\n\n\n"},{"role":"user","content":"SUBMISSION:\n````` file=packages/sample/src/tennis.gpspec.md\n# Tennis Score App\n\nThe web application enables users to input\ntennis match scores and displays the current score.\n`````\n\nYou are an expert academic reviewer for the SUBMISSION conference. \nWrite a review of ARTICLE.\n\nBe systematic. \nProvide missing references. \nExplain your steps. \nSuggest improvements. \nProvide an acceptance grade.\n\nUse this syntax to format your review:\n```\n- original:\n> This is the original sentence.\n- updated:\n> This is the updated sentence.\n- explanation:\nThis is why i rephrased this sentence.\n```\n\n"}]},"val":"REVIEW:\n\n```\n- original:\n> The web application enables users to input tennis match scores and displays the current score.\n\n- updated:\n> The web application provides an interface for users to input tennis match scores, and it dynamically displays the current score.\n\n- explanation:\nThe original sentence was slightly vague. The updated sentence provides more clarity by specifying that the application provides an interface for input and that the score display is dynamic.\n\n- missing references:\nNone.\n\n- improvements:\nThe description could benefit from more details about the application's features. For example, does it support different match formats or player numbers? 
Does it have any additional functionality like statistics or history tracking?\n\n- acceptance grade:\nAccept with minor revisions. The description is generally clear and concise, but it could be more informative.\n```"} {"sha":"8c47eb78733553e48d6c0844dff255badcf7884aae5a088bb92522c13cc6ac98","key":{"model":"gpt-4-32k","temperature":0.3,"max_tokens":10000,"messages":[{"role":"system","content":"You are concise. Answer in markdown.\n\n\nWhen explaining answers, take a deep breath.\n\n\nWhen generating or updating files you will use the following syntax:\n\nFile ./file1.ts:\n```\nWhat goes in\n./file1.ts.\n```\n\nFile ./file1.py:\n```\nWhat goes in\n./file1.py.\n```\n\nFile /path_to_file/file2.md:\n`````\nWhat goes in\n/path_to_file/file2.md.\n`````\n\nMake sure to use precisely ``` to guard file code sections.\n\nMake sure to use precisely ````` to guard file markdown sections.\n\nUse full path of filename in code section header.\n\nIf a file does not have changes, do not regenerate.\n\n\nAdditionally, generate a concise SUMMARY of the changes in all files using the following syntax:\n\nSUMMARY:\n```\nThis is a summary of the changes in files.\n```\n\n\n"},{"role":"user","content":"SUBMISSION:\n````` file=packages/sample/src/demovideo.gpspec.md\n# Demo video\n\nThis file contains a text description of the gptools demo file.\n\n- [demovideo.dialog.md](./demovideo.dialog.md)\n- [demovideo.transcript.md](./demovideo.transcript.md)\n- [gptools info](../../whitepaper/gptools-overview.md)\n`````\n\nARTICLE:\n````` file=packages/sample/src/demovideo.dialog.md\nEthan: Welcome to our presentation of GP tools in Visual Studio Code. We'll start by opening the GP spec file, which is a markdown file where we describe the task to do. In this example, we're going to implement an email validator.\n\nNancy: That's right, Ethan. The task is to write a function that takes a string and validates if the whole string is a valid email address. This description is in natural language, in what we call the GP spec file.\n\nEthan: Now, this GP spec file is run against a GP tool. The GP tools are JavaScript files defined in the project. Let's open the generate python GP tool, which has some metadata used to populate the UI, a programmatic way to access the context, and the typical prompt engineering using an intuitive syntax.\n\nNancy: From the user's perspective, they can take the code action tool and select the \"generate python code\" option. This will execute the GP tool against the GP spec and the rest of the context. The results are streamed in Visual Studio Code and previewed as a refactoring.\n\nEthan: That's right. We can see that a new file is parsed out, and an update to the file is made. If the user is happy with the changes, they can apply them just like any other refactoring. Next, they can run the test generation tool, which is another JavaScript file in the project.\n\nNancy: When running the test generation tool, we can see that it generates tests but uses the wrong framework. To fix this, the team can update the tool file. After updating the file, they can run the test tool again, and now it picks up the right test framework.\n\nEthan: Once the tests are generated, they can make sure the tests are passing. Now, let's say the customer changes their mind and decides that the function should also validate URLs. The user can update the spec to reflect this change and run the tool again.\n\nNancy: When running the tool again, it initially uses the wrong file. 
The user can refine the spec to use the same pipeline file, run the tool again, and now it regenerates the email validator. They can review the changes and apply them if they're satisfied.\n\nEthan: And that's it! Thank you for watching our presentation on GP tools in Visual Studio Code.\n`````\n\nARTICLE:\n````` file=packages/sample/src/demovideo.transcript.md\n# Transcript\n\n  This is a presentation of GP tools in visual studio code. And we start by opening the GP spec file, which is a markdown file word to use a described as a task to do. And this example, we're going to have. Implement. An email validator. We're the task is to write a function. taKes a string and. Validates that the whole string. Is a valid email address. \n\nSo that's the description of the problem in natural language, in what we call the GP spec file. And these GP fact file. Is run against a GP tool. The GP tools are JavaScript files defined in the project. We're going to open the generate piped on GP tool. Which has some metadata, which is used to populate the UI. It has programattic way to access the, the context, including the content of the GP. B GP spec file. And then it has the typ typical prompt engineering. Thing here using an intuitive syntax. \n\nfRom the user, who's writing the GP spec. We go in here and we take the code action tool. And we select the generate pipeline code. So this is going to. Execute the GP tool. Against the GP spec and the rest of the context, the results are streamed. In visual studio code and they're also previewed as a refactoring. So we can see that we parsed out. And you file here and we parsed out an update to the file. So if you're happy with that, you can go ahead and apply. The chains are applied, just like any other refactoring. And then why don't we can do now is go in and run the next tool, which is going to be. Test generation tool. So this is another JavaScript filed in project. We run that this is a different prompt. The prompt is about generating test. Now, in this case, you can see that. It is generating tasks, but using the wrong fair mark. So this is something that. We want to fix, add team level. So we're going to go into the tools. And we're going to update the tool. And this is something that. THe team is using to generate test. So there we go. You used a union testimonial term for writing tests? We have updated. The file that the tool file. \n\nNow let's go back and run the. Run the test tool again. \n\nCan we update it? To fight against. See, now that it is actually. Picking up the right test framework, which is using our team. Generating test cases. \n\nCan I apply that? Let's get, make sure our tests, our tests are passing. Now the customer changes mind. And decided that we also had to validate your ELLs. So we're going to go and update. The spec. So write a function via email address or your. All right. And now I can run this. \n\nBy applying the tool. Jerry And code. And we can see that it is. It is using the wrong, the wrong file. So I'm going to refine my spec and say that we should use the same pipeline file. This goes into the GSB spec file and run again. \n\nAnd now we're regenerating email validator, which we already have here. \n\nAnd we have a new refactoring, and if we can look at the diff, we can see that we've actually. We've kept that function but we've added his valid url and then we also have the function that combines the two that the change does. 
good we can click apply And that's it Thank you for watching \n`````\n\nARTICLE:\n````` file=packages/whitepaper/gptools-overview.md\n---\ntitle: gptools - AI-Enhanced Tools for Human Workflows\ndescription: gptools is a framework that enables teams to create and use AI-enhanced scripts to support their workflows.\nkeywords: gptools, AI scripts, workflows, foundation models, LLMs\n---\n# gptools: Empowering Human Workflows with AI-Enhanced Tools\n\n- Authors: Peli de Halleux, Michał Moskal, Ben Zorn\n- Date: October 2023\n- Repository: [gptools](https://github.com/microsoft/gptools/tree/main)\n\n## Abstract\n\ngptools is a framework that empowers teams, including non-developers, to create and use AI-enhanced scripts to support their workflows. gptools provides support for creating, understanding, and maintaining complex collections of documents such as software repositories, project management, etc. Our framework leverages foundation models (specifically LLMs) to enable a new kind of scripting that combines traditional code and natural language. The key elements of the gptools framework are **gpspecs** and **gptools**. gpspecs are natural language documents that describe specific user tasks to accomplish. gptools are general scripts that combine traditional software and natural language prompts and rely on an LLM to generate results. We execute gptools in the context of a gpspec to achieve a result. By separating these two abstractions, we allow gptools to be authored, maintained, and updated independently of the gpspecs that use them. The gptools framework includes an execution engine to execute a gptool and an IDE extension to VS Code to support user interaction with the gptool. We believe that gptools will empower even non-developers to automate their workflows in ways that were previously impossible.\n\n## Introduction\n\nThis document describes the gptools framework, which empowers teams, including non-developers, to use AI-enhanced scripts to support their efforts in creating, understanding, and maintaining complex artifacts. gptools leverages foundation models (LLMs) to enable a new kind of scripting that combines traditional code and natural language. To understand how gptools works, we provide an example. The key elements of gptools include gpspecs, gptools, the gpvm, and the gptools extension to VS code.\n\n```mermaid\ngraph LR\nF[User:\\nvia VSCode\\n UI] --> G[gpspec:\\nwrite email\\n recognizer]\nG --> A[gptool:\\ngenerate\\npython code]\nA --> C[gpvm pre:\\nexpand gpspec\\nand gptool\\ninto prompt]\nC --> D[Foundation Model]\nD --> E[gpvm post:\\nextract\\npython code]\nE --> F\n```\n\nThis diagram illustrates the workflow of gptools with a concrete example. The user wants to write a python function that recognizes well-formed email addresses and writes a gpspec. She then invokes a gptool, one that will generate python code from a spec, on that gpspec, much as a user might call a python script with some arguments. The gpvm composes the gpspec with the gptool creating an LLM prompt, which in turn is passed to the foundation model. 
The foundation model returns a result (python code) to the gpvm, which postprocesses it, creates the requested python file, and notifies the user.\n\nThis document will motivate the design of the gptools framework, illustrate its use with examples, and describe elements of implementation.\n\n## Complex Artifacts Require Complex Workflows\n\n- Software development is a complex process that requires the coordination of many different activities.\n- Historically, software development has been a highly manual process, with developers using a variety of tools to create and maintain the artifacts that comprise a software system.\n- Over time abstractions have been developed to help manage the complexity of software development.\n - Important examples include: Unix utilities and pipes, makefiles, build scripts, etc.\n- Modern software development includes many automated processes as well as manual processes such as code review, design review, bug triage, etc.\n\n## Foundation Models Create New Opportunities\n\n- The recent development of foundation models (aka LLMs) have created new opportunities for automating complex workflows.\n- AI has important advantages over traditional software:\n - AI models can perform tasks normal software cannot\n - AI models can be instructed using natural language, allowing non-programmers to use them\n- AI models also have disadvantages:\n - AI models are not perfect, and can make mistakes\n - AI models are not transparent, and it is difficult to understand why they make the decisions they do\n- AI models are best used to augment human workflows, not replace them\n\n## gptools - a Framework for AI-Enhanced Workflows\n\nVision: empower teams, including non-developers, to use AI-enhanced scripts to support their efforts to create, understand, and maintain complex artifacts\n\nGoals: support tool abstraction, modularity, reuse, but at the same time empower non-developers to author, maintain, and update AI-enhanced scripts\n\nApproach: Foundation models enable a new kind of scripting that allows script writers to achieve both greater functionality and greater ease of use. 
We separate scripts into two related parts: a generic reusable _gptool_ and a natural language _gpspec_ that instantiates the gptool in a particular context\n\nKey elements of gptools:\n\n- _gptools_ – Scripts that integrate traditional code and natural language and leverage foundation models in their execution\n- _gpspecs_ – Natural language documents that instantiate gptools in a particular context\n- _gpvm_ – A framework and runtime system that executes gpspecs and gptools\n- _gptools extension to VS code_ – supporting seamless user interaction with gptools\n\n## gptool: A New Kind of Script\n\n- A gptool is a script with the following components\n\n - A header that contains metadata related to the execution of the script (e.g., information about what LLM model to use, etc.)\n - A natural language prompt intended to be processed by a foundation model\n - An environment context that augments the natural language with additional data/information\n - Programming language constructs that are used to programmatically manipulate both inputs and outputs\n\n- Benefits of gptools\n - By separating the gptool from the gpspec, we allow gptools to become highly engineered shared artifacts, forming the basis of shared libraries.\n - A gptool can be used in many different contexts, and can be updated and maintained independently of the gpspecs that use it.\n - By separating the gptool from the gpspec, we allow non-developers to use gptools without having to understand the details of how they work.\n - At the same time, because a gptool contains a natural language prompt, it is easy for a non-developer to understand what the gptool does, and to modify it to suit their needs.\n\n## gpspec: Natural Language to Invoke a gptool\n\nJust as a chat enables a user to interact with an AI model, a gpspec is a natural language markdown document that defines a context in which to invoke a gptool.\n\nA gpspec is a standard markdown file, with the following additional elements:\n\n- Links to context elements that define the context in which a particular gptool is to be invoked. For example, a gpspec might contain links to markdown files, code files, etc.\n- Natural language describing the specific task to be performed as input to a gptool. For example, the spec used to generate code would include a description of the functionality and might include a description style guidelines to observe, etc.\n\n. A single gpspec file might be used as input to multiple gptools. For example, a gpspec might be used to generate code, documentation, and tests, each using a different gptool.\n\n## Expanding the Example\n\nTo better understand how gptools work, we expand the example given above. The diagram below illustrates the gptools workflow in greater detail. Our design of gptools is based on the following principles:\n\n- gptools are used in a context where there is human oversight on the content generated. As a result, our workflow starts and ends with the user.\n- We assume that the output of a gptool may be incomplete or incorrect, and that the user will need to interact with the output to refine it. As a result, we allow the user to accept/reject and directly modify the AI-generated content.\n- We assume that the user will want to understand how the AI model was used to generate the results. 
We provide a trace of how the gpvm composes the gpspec and gptool into a prompt that can be processed by the foundation model.\n- We support iterative development, where the user can both edit the gpspec and the gptool if they see opportunities to improve the results.\n\n```mermaid\nsequenceDiagram\nparticipant User\nparticipant VSCode\nparticipant gpspec\nparticipant gptool\nparticipant gpvm\nUser->>VSCode: Create/Edit gpspec\nVSCode->>gpspec: Save gpspec\nUser->>VSCode: Invoke gptool\nVSCode->>gptool: Execute gptool with gpspec context\ngptool->>gpvm: Request foundation model execution\ngpvm->>gptool: Return AI-generated output\ngptool->>VSCode: Update context with output\nVSCode->>User: Display updated context\n```\n\nThis diagram demonstrates the AI-enhanced workflow process in gptools. The gpspec instantiates the gptool, which interacts with the gpvm and foundation model. The AI-generated output is used to update the context, and the user interacts with the updated context through the gptools extension to VS code.\n\n### gptool Example: Python Developer gptool\n\nThis is an example of a simple gptool that generates python code from a gpspec file:\n\n```javascript\ngptool({\n title: \"Generate python code\",\n model: \"gpt-4\",\n description: \"Given a task, generate python code.\",\n})\n\ndef(\n \"CODE\",\n env.links.filter(\n (f) => f.filename.endsWith(\".py\") && !f.filename.startsWith(\"test_\")\n )\n)\ndef(\"TASK\", env.file)\n\n$`Generate python code for the task in TASK. Save code in CODE. If the CODE is already present, ensure that CODE matches the description in TASK and make changes to CODE if it does not.`\n```\n\nIn this example we see the following elements:\n\n- A header that contains metadata related to the execution of the script (e.g., information about what LLM model to use, etc.)\n- JavaScript code that manipulates the environment context, specifically by extracting only the python files from the context that do not start with \"test\\_\".\n- Variable definitions that allow the prompt to refer to elements in the context. TASK refers to the gpspec file, and CODE refers to the python files in the context.\n- Natural language that combines these elements.\n\n### gpspec Example: Using the Python Developer gptool\n\nThis is an example of a gpspec file that uses the python generating gptool shown above:\n\n```markdown\n# email address and URL recognizer\n\nWrite a function that takes a string argument and returns true if the whole string is a valid email address, false otherwise.\n```\n\nIn this example, there is no additional context needed to invoke the python developer gptool. 
In other cases, the gpspec file will contain links to additional context elements, such as markdown, code files, etc.\n\n## gpvm - A Framework for Executing gpspecs and gptools\n\nEvery system that interacts with a foundation model includes layers that transform user input into a prompt that can be processed by the foundation model, and layers that transform the output of the foundation model into a form that is useful to the user.\n\n- gpvm is a runtime environment that:\n - Captures the context defined by the gpspec\n - Executes whatever code is present in the gptool (in the example above, the variables CODE and TASK are defined by executing JavaScript code)\n - Expands the gpspec and natural language of the gptool into a prompt that can be processed by the foundation model\n - Sends the results to the AI model\n - Processes the results on return to update the user context (which might include creating files, updating files, generating user feedback, etc.)\n\nUnlike many AI model orchestration frameworks, such as Semantic Kernel, gpvm does not leverage the model to do chain-of-thought reasoning or utomatically stage multiple invocations of the AI model during its execution. gptools currently relies on the user to choose and invoke the appropriate gptool as needed.\n\n## gptools Extension to VS Code\n\nWe believe that human oversight of AI models is essential to their effective use. To support this, we have created a VS Code extension that allows a user to interact with a gpspec and gptool in a natural way.\n\nThe extension provides the following capabilities:\n\n- A command palette that allows a user to select a gptool to invoke in the context of a given gpspec file\n- A token management system that supports connecting with the AI model of interest\n- An invocation of the gpvm to process the user input and generate results\n- A user interface that allows the user to interact with the AI model to refine the results\n- A gptool trace viewing mechanism that allows users to understand how the AI model was used to generate the results\n\n## Implications of gptools\n\nThe existence of powerful programming tools based on AI that are usable by non-developers is transformative.\n\nJust as the development of JavaScript enabled Web 2.0, and python enabled the creation of the current AI software ecosystem, gptools will fuel a new generation of AI-enhanced applications.\n\nWe envision the creation of gptools for many different verticals, with opportunities for customization and authoring at many levels of expertise:\n\n- Professional developers and architects will define collections of gptools for a given vertical just as packages are authored and maintained today\n- Professional developers can author and maintain individual gptools\n- Developers and non-developers can customize gptools for their particular organization or application needs\n- Non-developers can author and maintain gpspecs for their particular projects\n\nFurthermore, the existence of gptools can empower non-developers to automate their workflows in ways that were previously impossible.\n\n## Related Work\n\ngptools are related to a number of foundation model projects that automate workflows and encapsulate LLM actions in software artifacts. We describe some of these projects below:\n\n### AI Tools for Developers\n\n- [GitHub Copilot](https://copilot.github.com/) is a VS Code extension that uses the Codex LLM to suggest code completions based on the context of the current file. 
Copilot provides LLM support for writing code but does not currently have an extensibility model or a way to create, maintain and apply collections of AI-enhanced scripts.\n- [codeplan](https://arxiv.org/abs/2309.12499) is a task-agnostic framework that frames repository-level coding as a planning problem, synthesizing a multi-step chain of edits where each step results in a call to an LLM on a code location with context derived from the entire repository, previous code changes, and task-specific instructions. While codeplan uses LLM prompts to generate code, it does not provide a way to create, maintain and apply collections of AI-enhanced scripts.\n\n### LLM Automation/Orchestration Frameworks\n\n- [autogpt](https://github.com/Significant-Gravitas/AutoGPT) is an AI agent that utilizes a large language model to drive its actions and decisions. Unlike traditional language model applications, AutoGPT does not require repeated prompting by a human and can autonomously develop and manage tasks.\n- [langchain](https://www.langchain.com/) is a Python library designed to simplify the development of Natural Language Processing (NLP) applications using large language models. It provides a framework for connecting language models to other sources of data and allowing them to interact with their environment.\n- [semantic kernel](https://github.com/microsoft/semantic-kernel) is an open-source SDK that allows developers to easily integrate AI services with conventional programming languages such as C# and Python. It provides a framework for connecting language models to other sources of data, allowing them to interact with their environment and perform tasks autonomously.\n- [autogen](https://arxiv.org/abs/2308.08155) is a framework for simplifying the orchestration, optimization, and automation of large language model (LLM) workflows. It offers customizable and conversable agents that leverage LLMs while addressing their limitations by integrating with humans and tools and having conversations between multiple agents via automated chat.\n\ngptools differ from existing AI task automation frameworks in the following ways:\n\n- gptools are designed to be authored, maintained, and applied by non-developers as well as developers.\n- gptools define a separation between the gptool and the gpspec that instantiates it, allowing for modularity, reuse, and easier understanding by non-developers.\n- gptools do not assume task automation and provide a user experience that integrates seamlessly with VS code.\n- The gptools UI exposes both the content and execution of each gptools to facilitate human oversight and understanding of how the AI model was used to generate the results.\n`````\n\nYou are an expert academic reviewer for the SUBMISSION conference. \nWrite a review of ARTICLE.\n\nBe systematic. \nProvide missing references. \nExplain your steps. \nSuggest improvements. \nProvide an acceptance grade.\n\nUse this syntax to format your review:\n```\n- original:\n> This is the original sentence.\n- updated:\n> This is the updated sentence.\n- explanation:\nThis is why i rephrased this sentence.\n```\n\n"}]},"val":"REVIEW:\n\nThe articles submitted provide a detailed overview of the gptools framework and its usage in Visual Studio Code. The authors have done a commendable job in explaining the concept, its implementation, and its potential implications. However, there are a few areas that could be improved for clarity and completeness.\n\n1. 
In the `demovideo.dialog.md` and `demovideo.transcript.md` files, the dialogue and transcript could be better formatted for readability. \n\n- original:\n> Ethan: Welcome to our presentation of GP tools in Visual Studio Code. We'll start by opening the GP spec file, which is a markdown file where we describe the task to do. In this example, we're going to implement an email validator.\n\n- updated:\n> **Ethan**: Welcome to our presentation of GP tools in Visual Studio Code. We'll start by opening the GP spec file, which is a markdown file where we describe the task to do. In this example, we're going to implement an email validator.\n\n- explanation:\nBy bolding the speaker's name, it makes it easier for the reader to follow the dialogue.\n\n2. In the `gptools-overview.md` file, the authors could provide references to the tools and concepts mentioned for the benefit of readers unfamiliar with them.\n\n- original:\n> gptools is a framework that empowers teams, including non-developers, to create and use AI-enhanced scripts to support their workflows.\n\n- updated:\n> gptools is a framework that empowers teams, including non-developers, to create and use AI-enhanced scripts to support their workflows [1].\n\n- explanation:\nAdding a reference to the original paper or documentation of gptools will help readers who want to delve deeper into the topic.\n\n3. The authors could also consider adding a section on the limitations and potential challenges of using gptools. This would provide a more balanced view of the framework.\n\n- original:\n> We believe that gptools will empower even non-developers to automate their workflows in ways that were previously impossible.\n\n- updated:\n> We believe that gptools will empower even non-developers to automate their workflows in ways that were previously impossible. However, it is important to note that there may be limitations and challenges in its implementation, such as [insert limitations here].\n\n- explanation:\nDiscussing potential limitations and challenges gives a more balanced view of the framework and prepares users for possible issues they might encounter.\n\nOverall, the articles are well-written and provide a comprehensive overview of gptools. With a few minor improvements, they could be even more informative and useful for the readers.\n\nAcceptance Grade: Accept with minor revisions."} {"sha":"51f1eef52de4e2b4d31e48f07b8282711f1ea367c7d75abae255e29fb57dcca3","key":{"model":"gpt-4-32k","temperature":0.3,"max_tokens":10000,"messages":[{"role":"system","content":"You are concise. 
Answer in markdown.\n\n\nWhen explaining answers, take a deep breath.\n\n\nWhen generating or updating files you will use the following syntax:\n\nFile ./file1.ts:\n```\nWhat goes in\n./file1.ts.\n```\n\nFile ./file1.py:\n```\nWhat goes in\n./file1.py.\n```\n\nFile /path_to_file/file2.md:\n`````\nWhat goes in\n/path_to_file/file2.md.\n`````\n\nMake sure to use precisely ``` to guard file code sections.\n\nMake sure to use precisely ````` to guard file markdown sections.\n\nUse full path of filename in code section header.\n\nIf a file does not have changes, do not regenerate.\n\n\nAdditionally, generate a concise SUMMARY of the changes in all files using the following syntax:\n\nSUMMARY:\n```\nThis is a summary of the changes in files.\n```\n\n\n"},{"role":"user","content":"SUBMISSION:\n````` file=packages/sample/src/tennis.gpspec.md\n# Tennis Score App\n\nThe web application enables users to input\ntennis match scores and displays the current score.\n`````\n\nYou are an expert academic reviewer for the SUBMISSION conference. \nWrite a review of ARTICLE.\n\nBe systematic. \nProvide missing references. \nExplain your steps. \nSuggest improvements. \nProvide an acceptance grade.\n\nUse this syntax to format your review:\n```\n- original:\n> This is the original sentence.\n- updated:\n> This is the updated sentence.\n- explanation:\nThis is why i rephrased this sentence.\n```\n\n"}]},"val":"REVIEW:\n\n```\n- original:\n> The web application enables users to input tennis match scores and displays the current score.\n\n- updated:\n> The web application facilitates users to input tennis match scores and dynamically displays the current score.\n\n- explanation:\nThe original sentence was clear but lacked the dynamic nature of the application. The updated sentence emphasizes the application's ability to update scores in real-time.\n\n- missing references:\nNo references were provided in the article. It would be beneficial to include references to similar applications or research that supports the need or effectiveness of such an application.\n\n- improvements:\nThe article could benefit from a more detailed description of the application's features and its user interface. It would also be helpful to include information about the technology stack used to develop the application.\n\n- acceptance grade:\nGiven the lack of detail and references, I would grade this submission as \"Requires Major Revisions\".\n```\n"} +{"sha":"81376a7452484fb681b9e7d9330f2641905b5860d6d339b0001dff61ad222f66","key":{"model":"gpt-4-32k","temperature":0,"max_tokens":4000,"messages":[{"role":"system","content":"---\nYou are concise. Answer in markdown.\n\n\n---\nUse the following format to create **file annotations** (same as GitHub Actions workflow). 
Group annotations together.\n\nANNOTATION:\n```\n::(notice|warning|error) file=,line=,endLine=::\n```\n\nFor example, an warning in main.py on line 3 with message \"typo\" would be:\n\nANNOTATION:\n```\n::warning file=main.py,line=3,endLine=3::typo\n```\n\nFor example, an error in app.js between line 1 and 4 with message \"Missing semicolon\" and a warning in index.ts on line 10, would be:\n\nANNOTATION:\n```\n::error file=app.js,line=1,endLine=4::Missing semicolon\n::warning file=index.ts,line=10,endLine=10::identation\n```\n\n\n"},{"role":"user","content":"CODE:\n``` file=packages/sample/src/counting.py\n[1] # This is my favorite program!\n[2] \n[3] for i in range(1, 45):\n[4] print(i)\n[5] \n[6] for c in range(ord('a'), ord('z') + 1):\n[7] print(chr(c))\n[8] \n[9] # And now, at the end:\n[10] print(\"The end\")\n```\n\nYou are an EXPORT software developer with deep knowledge of all programming languages.\n\nYour job is to do a code review of CODE and create ANNOTATION with code improvement notice, warning and errors. \n- No more than 5 annotations per code file.\n- Consider readability, maintainability, performance, security, and correctness. \n- The code in CODE is written by a novice programmer.\n\nDo your best and will get a large tip. $$$.\n\n"}]},"val":"ANNOTATION:\n```\n::notice file=packages/sample/src/counting.py,line=1,endLine=1::Consider adding a more descriptive comment about what the program does.\n::warning file=packages/sample/src/counting.py,line=3,endLine=4::Use list comprehension for better performance and readability.\n::notice file=packages/sample/src/counting.py,line=6,endLine=7::Use list comprehension for better performance and readability.\n::error file=packages/sample/src/counting.py,line=6,endLine=7::Avoid using magic numbers ('a' and 'z'). Consider using constants for better readability and maintainability.\n::notice file=packages/sample/src/counting.py,line=10,endLine=10::Consider adding a newline at the end of the file for better compatibility across different platforms and editors.\n```"} diff --git a/docs/cli.md b/docs/cli.md index 5922b7d2f7..a01c82297f 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -26,12 +26,28 @@ or configured through the `keys` command (as a JSON payload). Output the entire response as JSON to the stdout. -### --output +### --out Saves the results in a JSON file, along with mardown files of the output and the trace. ```bash -node gptools.js run --output +node gptools.js run --out +``` + +### --out-trace + +Save the markdown trace to the specified file. 
+ +```bash +node gptools.js run --out-trace +``` + +In a GitHub Actions workflow, you can use this feature to save the trace as a step summary (`GITHUB_STEP_SUMMARY`): + +```yaml +- name: Run GPTools tool on spec + run: | + node gptools.js run --out-trace $GITHUB_STEP_SUMMARY ``` ### --dry-run diff --git a/packages/cli/src/main.ts b/packages/cli/src/main.ts index 8d4bcb0c3d..4aa19a3f61 100644 --- a/packages/cli/src/main.ts +++ b/packages/cli/src/main.ts @@ -1,4 +1,5 @@ import { + FragmentTransformResponse, RequestError, clearToken, host, @@ -55,6 +56,7 @@ async function run( json: boolean maxDelay: string dryRun: boolean + outTrace: string } ) { const out = options.out @@ -62,6 +64,7 @@ async function run( const retry = parseInt(options.retry) || 3 const retryDelay = parseInt(options.retryDelay) || 5000 const maxDelay = parseInt(options.maxDelay) || 180000 + const outTrace = options.outTrace const toolFiles: string[] = [] if (/.gptool\.js$/i.test(tool)) toolFiles.push(tool) @@ -99,7 +102,8 @@ async function run( ) if (!gpspec) throw new Error(`spec ${spec} not found`) const fragment = gpspec.roots[0] - const res = await backOff( + + const res: FragmentTransformResponse = await backOff( async () => await runTemplate(gptool, fragment, { infoCb: (progress) => {}, @@ -121,6 +125,7 @@ async function run( } ) + if (outTrace && res.trace) await writeText(outTrace, res.trace) if (out) { const jsonf = /\.json$/i.test(out) ? out : out + ".json" const userf = jsonf.replace(/\.json$/i, ".user.md") @@ -147,6 +152,8 @@ async function run( console.log(user) } else console.log(res.text) } + + if (res.error) throw res.error } async function listTools() { @@ -183,7 +190,8 @@ async function main() { .command("run") .description("Runs a GPTools against a GPSpec") .arguments(" [spec]") - .option("-o, --out ", "output file") + .option("-o, --out ", "output file. Extra markdown files for output and trace will also be generated") + .option("-ot, --out-trace ", "output file for trace") .option("-r, --retry ", "number of retries", "3") .option("-j, --json", "emit full JSON response to output") .option( diff --git a/packages/core/src/expander.ts b/packages/core/src/expander.ts index 89a0b18dcb..65e8eefbdc 100644 --- a/packages/core/src/expander.ts +++ b/packages/core/src/expander.ts @@ -64,6 +64,11 @@ export interface FragmentTransformResponse { * Summary of the output generated by the LLM */ summary?: string + + /** + * Error message if any + */ + error?: unknown } function trimNewlines(s: string) { @@ -473,6 +478,7 @@ export async function runTemplate( // if the expansion failed, show the user the trace if (!success) { return { + error: new Error("Template failed"), prompt, vars, trace, @@ -559,7 +565,17 @@ The user requested to cancel the request. fileEdits: {}, }) } - throw error + + return { + error, + prompt, + vars, + edits: [], + annotations: [], + fileEdits: {}, + trace, + text, + } } trace += details("LLM response", fenceMD(text)) @@ -703,5 +719,25 @@ The user requested to cancel the request. 
}) } + if (edits.length) + res.trace += details( + "edits", + `| Type | Filename | Message |\n| --- | --- | --- |\n` + + edits + .map((e) => `| ${e.type} | ${e.filename} | ${e.label} |`) + .join("\n") + ) + if (annotations.length) + res.trace += details( + "annotations", + `| Severity | Filename | Line | Message |\n| --- | --- | --- | --- |\n` + + annotations + .map( + (e) => + `| ${e.severity} | ${e.filename} | ${e.range[0]} | ${e.message} |` + ) + .join("\n") + ) + return res } diff --git a/packages/vscode/src/state.ts b/packages/vscode/src/state.ts index 563f561cd9..7f254dd39f 100644 --- a/packages/vscode/src/state.ts +++ b/packages/vscode/src/state.ts @@ -298,6 +298,7 @@ ${e.message}` .then((resp) => { r.response = resp r.computing = false + if (resp.error) r.error = resp.error }) .catch((e) => { r.computing = false