Add Meta-Llama-3-405B-Instruct model and update usage stats (#783)
* πŸš€ Add Meta-Llama-3-405B-Instruct model and update usage stats

* πŸ“„ Add Azure AI Serverless APIs deployment documentation πŸ’»

* removed SWE-bench submodule
pelikhan authored Oct 16, 2024
1 parent 47bebce commit 9f45c22
Showing 6 changed files with 77 additions and 11 deletions.
49 changes: 49 additions & 0 deletions docs/src/content/docs/getting-started/configuration.mdx
@@ -351,6 +351,55 @@ The rest of the steps are the same: Find the deployment name and use it in your

</Steps>

## Azure AI Serverless APIs <a id="azureai" href=""></a>

Certain models in the model catalog can be deployed as [a serverless API](https://learn.microsoft.com/en-us/azure/ai-studio/how-to/deploy-models-serverless-availability) with pay-as-you-go billing.
This kind of deployment provides a way to consume models
as an API without hosting them on your subscription,
while keeping the enterprise security and compliance that organizations need.
This deployment option doesn't require quota from your subscription.

<Steps>

<ol>

<li>

Open https://ai.azure.com/ and navigate to the **Deployments** page.

</li>

<li>

Deploy a **base model** from the catalog
using the `Deployment Options` -> `Serverless API` option.

</li>

<li>

Configure the **Endpoint Target URL** as the `OPENAI_API_BASE` variable and set
`OPENAI_API_TYPE=azure` in the `.env` file.

```txt title=".env"
OPENAI_API_BASE=https://...azurewebsites.net
OPENAI_API_TYPE=azure
```

</li>

<li>

Use the `large` model in your script, as in the sketch after these steps.

</li>

</ol>

</Steps>
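A minimal script header for this setup, as a sketch (assumes the standard GenAIScript `script()` entry point; the `large` alias resolves against the `.env` configuration above):

```ts
// sketch: hypothetical script front matter using the "large" model alias
script({
    model: "large", // served by the serverless endpoint configured in .env
})
```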

Note: more complete support for serverless deployments will come in future versions.

## GitHub Copilot Chat Models <a id="github-copilot" href=""></a>

If you have access to **GitHub Copilot Chat in Visual Studio Code**,
2 changes: 1 addition & 1 deletion packages/core/src/chat.ts
@@ -505,7 +505,7 @@ async function processChatMessage(
cancellationToken,
} = options

stats.addUsage(req, resp.usage)
stats.addUsage(req, resp)

if (resp.text)
messages.push({
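The call site now hands the whole response to `addUsage` rather than just `resp.usage`, so the provider-reported model travels with the token counts (see `usage.ts` below). A sketch of the fields this commit's diff actually touches (the real `ChatCompletionResponse` in `chattypes.ts` likely carries more):

```ts
// partial, assumed shape — only the fields visible in this diff
interface ChatCompletionResponse {
    text?: string // assistant output, pushed onto messages when present
    model?: string // model the provider reports having served
    usage?: ChatCompletionUsage // token accounting for this turn
}
```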
8 changes: 8 additions & 0 deletions packages/core/src/pricing.json
@@ -166,5 +166,13 @@
"azure:gpt-4-32k": {
"price_per_million_input_tokens": 60,
"price_per_million_output_tokens": 120
},
"openai:Meta-Llama-3-405B-Instruct": {
"price_per_million_input_tokens": 5.33,
"price_per_million_output_tokens": 16
},
"azure:Meta-Llama-3-405B-Instruct": {
"price_per_million_input_tokens": 5.33,
"price_per_million_output_tokens": 16
}
}
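As a quick sanity check on the new rates (token counts invented for illustration):

```ts
// back-of-the-envelope cost for one request at the new Meta-Llama-3-405B-Instruct rates
const promptCost = (100_000 / 1_000_000) * 5.33 // 100k prompt tokens -> $0.533
const completionCost = (10_000 / 1_000_000) * 16 // 10k completion tokens -> $0.16
const total = promptCost + completionCost // ≈ $0.69
```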
1 change: 1 addition & 0 deletions packages/core/src/types/prompt_template.d.ts
@@ -120,6 +120,7 @@ interface ModelConnectionOptions {
| "openai:gpt-3.5-turbo"
| "azure:gpt-4o"
| "azure:gpt-4o-mini"
| "azure:llama3-1-405"
| "ollama:phi3"
| "ollama:llama3"
| "ollama:mixtral"
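A hypothetical use of the new union member (assuming, as the surrounding diff suggests, that this union types a script's `model` option; note the alias differs from the `Meta-Llama-3-405B-Instruct` id added to `pricing.json`):

```ts
// hypothetical: referencing the new Azure deployment alias in a model option
const options: ModelConnectionOptions = {
    model: "azure:llama3-1-405",
}
```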
27 changes: 18 additions & 9 deletions packages/core/src/usage.ts
@@ -5,6 +5,7 @@

import {
ChatCompletionMessageParam,
ChatCompletionResponse,
ChatCompletionUsage,
CreateChatCompletionRequest,
} from "./chattypes"
@@ -62,8 +63,7 @@ export function estimateCost(modelId: string, usage: ChatCompletionUsage) {
* @returns A string representation of the cost.
*/
export function renderCost(value: number) {
if (isNaN(value)) return ""
if (value === 0) return `0$ (cached)`
if (!value) return ""
return value <= 0.01
? `${(value * 100).toFixed(3)}Β’`
: value <= 0.1
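The remaining formatting tiers are truncated by the diff view; from the branches that are visible, a couple of illustrative calls (values invented):

```ts
// illustrative only — behavior of the branches visible in this hunk
renderCost(NaN) // "" — falsy values render empty
renderCost(0) // "" — the old "0$ (cached)" label is gone
renderCost(0.005) // "0.500¢" — amounts of $0.01 or less render in cents
```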
@@ -86,6 +86,7 @@ export class GenerationStats {
private chatTurns: {
messages: ChatCompletionMessageParam[]
usage: ChatCompletionUsage
model: string
}[] = []

/**
@@ -112,14 +113,22 @@
}
}

get resolvedModel() {
return this.chatTurns?.[0]?.model ?? this.model
}

/**
* Calculates the total cost based on the usage statistics.
*
* @returns The total cost.
*/
cost(): number {
return [
estimateCost(this.model, this.usage),
...this.chatTurns.map(
({ usage, model }) =>
estimateCost(model, usage) ??
estimateCost(this.model, usage)
),
...this.children.map((c) => c.cost()),
].reduce((a, b) => (a ?? 0) + (b ?? 0), 0)
}
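Each turn is now priced by the model that actually served it, with `??` falling back to the configured model. `estimateCost` presumably returns `undefined` for models missing from `pricing.json`, which the reducer's `(a ?? 0) + (b ?? 0)` treats as zero; a tiny illustration (values invented):

```ts
// invented values: unpriced turns (undefined) contribute nothing to the total
const turnCosts: (number | undefined)[] = [undefined, 0.12, undefined]
const total = turnCosts.reduce((a, b) => (a ?? 0) + (b ?? 0), 0) // 0.12
```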
@@ -227,7 +236,7 @@
if (this.model || c) {
const au = this.accumulatedUsage()
logVerbose(
`${indent}${this.label ? `${this.label} (${this.model})` : this.model}> ${au.total_tokens} tokens (${au.prompt_tokens} -> ${au.completion_tokens}) ${renderCost(c)}`
`${indent}${this.label ? `${this.label} (${this.resolvedModel})` : this.resolvedModel}> ${au.total_tokens} tokens (${au.prompt_tokens} -> ${au.completion_tokens}) ${renderCost(c)}`
)
}
if (this.chatTurns.length > 1)
@@ -245,11 +254,9 @@
* @param req - The request containing details about the chat completion.
* @param usage - The usage statistics to be added.
*/
addUsage(req: CreateChatCompletionRequest, usage: ChatCompletionUsage) {
if (!usage) return
const { model, messages } = req
if (this.model && model !== this.model)
logWarn(`model mismatch: got ${model}, expected ${this.model}`)
addUsage(req: CreateChatCompletionRequest, resp: ChatCompletionResponse) {
const { usage, model } = resp
const { messages } = req

this.usage.completion_tokens += usage.completion_tokens ?? 0
this.usage.prompt_tokens += usage.prompt_tokens ?? 0
@@ -264,9 +271,11 @@
this.usage.completion_tokens_details.reasoning_tokens +=
usage.prompt_tokens_details?.cached_tokens ?? 0

const { provider } = parseModelIdentifier(this.model)
const chatTurn = {
messages: structuredClone(messages),
usage: structuredClone(usage),
model: `${provider}:${model}`,
}
this.chatTurns.push(chatTurn)
}
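The per-turn pricing key recombines the configured provider with the model name the provider reported, which is what lets the new `pricing.json` entries match; a sketch under assumed shapes:

```ts
// assumes parseModelIdentifier splits "provider:model" ids into named parts
const { provider } = parseModelIdentifier("azure:gpt-4o") // assumed -> "azure"
const servedModel = "Meta-Llama-3-405B-Instruct" // reported by the response
const key = `${provider}:${servedModel}` // "azure:Meta-Llama-3-405B-Instruct"
```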
1 change: 0 additions & 1 deletion packages/sample/src/swe-bench/repos/SWE-bench_Verified
Submodule SWE-bench_Verified deleted from 944835
