Commit
Merge pull request #133 from nuprl/v5
v0.5 proposal
cassanof authored Apr 7, 2024
2 parents 8b683a9 + d2de08a commit 92407af
Showing 11 changed files with 728 additions and 31 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,3 +4,4 @@ __pycache__
/experiments
*/local_models
cache
scratch/
13 changes: 9 additions & 4 deletions README.md
@@ -1,7 +1,7 @@
# Multi-Programming Language Evaluation of Large Language Models of Code (MultiPL-E)

MultiPL-E is a system for translating unit test-driven neural code generation
benchmarks to new languages. We have used MultiPL-E to translate two popular
Python benchmarks (HumanEval and MBPP) to 18 other programming languages.

For more information:
@@ -17,7 +17,13 @@ For more information:

## Versions

-- Version 0.4.0: Work in progress.
+- Version 0.5.0: Instruction-following support and new languages
+
+- New languages: Luau, Elixir, Lean, Coq, Dafny
+- Support for instruction-following prompts
+- vLLM support for faster evaluation
+
+- Version 0.4.0: QoL improvements and new languages

- New languages: OCaml, MATLAB
- Using `.jsonl` instead of `.json` for prompts
@@ -28,7 +34,6 @@ For more information:
- This version corrects several bugs in prompts and test cases that resulted in lower
pass@k rates for some of the statically typed languages. The most significant difference
is that the pass@k for Java increases by about 2% on HumanEval.


- Version 0.2.0: used to evaluate [SantaCoder]

195 changes: 195 additions & 0 deletions autochatmodel.py
@@ -0,0 +1,195 @@
"""
This script is used to generate completions via a OpenAI API. It can technically
be used with any OpenAI-API-compatible inference tool, like vLLM using the
web server.
Tested on: openai==1.12.0
"""
from typing import Dict, List
from multipl_e.completions import make_main, partial_arg_parser
from multipl_e.util import gunzip_json, gzip_json
import os
from pathlib import Path
import yaml

DEFAULT_TEMPLATE_PATH = Path(__file__).resolve().parent / "chat-templates" / "default.yaml"


def chat_template(interps: Dict[str, str] = {}, path=DEFAULT_TEMPLATE_PATH) -> List[Dict[str, str]]:
    with open(path, "r") as f:
        template: List[Dict[str, str]] = yaml.safe_load(f)

    assert isinstance(template, list), "Template must be a list of messages"
    assert all(
        isinstance(msg, dict) for msg in template), "Each message must be a dictionary"
    assert all(
        "role" in msg and "content" in msg for msg in template), "Each message must have a role and content"

    # Interpolate the template with interps. Every key in interps is a string;
    # interpolation points appear in the YAML content as ${key}.
    def interpolate(content: str) -> str:
        for k, v in interps.items():
            content = content.replace(f"${{{k}}}", v)
        return content

    for msg in template:
        msg["content"] = interpolate(msg["content"])
        # check that no interpolation was missed
        assert "${" not in msg["content"], f"Missed interpolation in {msg['content']}"

    return template
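# For example (illustrative): with the default template in
# chat-templates/default.yaml, chat_template({"prompt": "def add(a, b):"})
# returns that file's system and user messages with ${prompt} replaced by
# "def add(a, b):".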


def markdown_codeblock_extract(new: str) -> str:
    lines = new.split("\n")
    buf = ""
    in_codeblock = False
    for ln in lines:
        if ln.startswith("```"):
            if in_codeblock:
                break
            else:
                in_codeblock = True
        elif in_codeblock:
            buf += ln + "\n"
    return buf
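# For example (illustrative):
#   markdown_codeblock_extract("Sure!\n```python\nprint('hi')\n```\nmore text")
# returns "print('hi')\n": only the first fenced block is kept, the fence lines
# (including the language tag) are dropped, and text after the closing fence is
# ignored.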


def post_process(new: str) -> str:
    try:
        extracted = markdown_codeblock_extract(new)
    except Exception as e:
        print(f"Failed to extract codeblock from {new}: {e}")
        extracted = new
    return extracted.strip()


class OpenAIEngine:
    def __init__(self, name, endpoint=None):
        import openai
        key = os.environ.get("OPENAI_API_KEY")
        if key is None:
            raise KeyError(
                "Please set the OPENAI_API_KEY environment variable")
        self.client = openai.Client(api_key=key, base_url=endpoint)
        self.name = name

    def generate(self, convos: List[List[Dict[str, str]]], max_tokens: int, temperature: float, top_p: float, stop) -> List[str]:
        outputs = []
        for convo in convos:
            response = self.client.chat.completions.create(
                model=self.name,
                messages=convo,  # type: ignore
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            o = response.choices[0].message.content
            assert o is not None, "OpenAI returned a null response"
            outputs.append(post_process(o))

        return outputs


class VLLMEngine:
    def __init__(self, name, num_gpus=1):
        import torch
        from vllm import LLM
        dtype = "auto"
        if torch.cuda.is_bf16_supported():
            dtype = "bfloat16"

        self.model = LLM(name, dtype=dtype, tensor_parallel_size=num_gpus)
        self.tokenizer = self.model.get_tokenizer()

    def generate(self, convos: List[List[Dict[str, str]]], max_tokens: int, temperature: float, top_p: float, stop) -> List[str]:
        from vllm import SamplingParams
        formatted = []
        for convo in convos:
            formatted.append(self.tokenizer.apply_chat_template(
                convo, add_generation_prompt=True, tokenize=False))

        outs = self.model.generate(
            formatted,
            SamplingParams(
                top_p=top_p,
                temperature=temperature,
                max_tokens=max_tokens,
            ),
        )

        return [post_process(o.outputs[0].text) for o in outs]


class ChatModel:
    def __init__(self, name, engine="openai", template=DEFAULT_TEMPLATE_PATH, endpoint=None, num_gpus=1):
        self.template = template
        if engine == "openai":
            self.engine = OpenAIEngine(name, endpoint)
        elif engine == "vllm":
            self.engine = VLLMEngine(name, num_gpus)
        else:
            raise ValueError(f"Unsupported engine: {engine}")

    def completions(
        self, prompts: List[str], max_tokens: int, temperature: float, top_p, stop
    ):
        # stop tokens are ignored due to instruct-based completion
        prompts = [prompt.strip() for prompt in prompts]
        # params = SamplingParams(temperature=temperature,
        #                         top_p=top_p, max_tokens=max_tokens, stop=stop)
        # outputs = self.model.generate(prompts, params, use_tqdm=False)
        # return [stop_at_stop_token(o.outputs[0].text, stop) for o in outputs]
        convo_prompts = [chat_template({"prompt": prompt}, path=self.template)
                         for prompt in prompts]
        outputs = self.engine.generate(
            convo_prompts, max_tokens, temperature, top_p, stop)

        return outputs


def openai_partial_arg_parser():
    args = partial_arg_parser()
    args.add_argument("--name", type=str, required=True)
    args.add_argument("--engine", type=str, choices=["openai", "vllm"], default="openai")
    args.add_argument("--chat-template", type=str,
                      default=str(DEFAULT_TEMPLATE_PATH))
    args.add_argument("--num-gpus", type=int, default=1)
    args.add_argument("--name-override", type=str, default=None)
    args.add_argument("--endpoint", type=str, default=None)
    return args


def do_name_override(args):
    """
    Applies the --name-override flag, or uses the model name, replacing the "/" and
    "-" characters that the rest of the toolchain does not accept.
    """
    if args.name_override:
        name = args.name_override
    else:
        name = args.name.replace("/", "_").replace("-", "_")
    return name


def main():
    args = openai_partial_arg_parser()
    args = args.parse_args()
    model = ChatModel(args.name, args.engine,
                      args.chat_template, args.endpoint, args.num_gpus)
    name = do_name_override(args)
    make_main(args, name, model.completions)
    # hotpatch the results to have empty "prompt" fields
    # super hacky, but it works
    path = Path(args.output_dir).glob("*.json.gz")
    for p in path:
        data = gunzip_json(p)
        assert data is not None, f"Failed to read {p}"
        data["prompt"] = ""
        gzip_json(p, data)


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion automodel_vllm.py
@@ -41,7 +41,7 @@ def automodel_partial_arg_parser():
args.add_argument("--tokenizer_name", type=str)
args.add_argument("--tokenizer_revision", type=str)
args.add_argument("--name-override", type=str)
args.add_argument("--num_gpus", type=int, default=1)
args.add_argument("--num-gpus", type=int, default=1)
return args


8 changes: 8 additions & 0 deletions chat-templates/default.yaml
@@ -0,0 +1,8 @@
- role: system
  content: "You are a helpful programming assistant designed to complete code snippets."
- role: user
  content: |
    Please generate code to complete the following problem:
    ```
    ${prompt}
    ```
3 changes: 3 additions & 0 deletions evaluation/Dockerfile
@@ -72,6 +72,9 @@ RUN apt-get install -yqq coq
# Lean
RUN wget https://github.com/leanprover/lean4/releases/download/v4.6.0-rc1/lean-4.6.0-rc1-linux.zip -O /tmp/lean.zip && unzip /tmp/lean.zip -d /root/lean/ && ln -s /root/lean/bin/lean /bin/lean

# install numpy for humanevalplus
RUN python3 -m pip install numpy

COPY src /code
WORKDIR /code
ENTRYPOINT ["python3", "main.py"]