# examples/low_level_api/low_level_api_chat_cpp.py

"""
This is an example implementation of main.cpp from llama.cpp
Quirks:
 * It's not exactly alike since this port is designed around programmatic I/O
 * Input is always echoed if on, so it should be turned off when using "input()"
 * The first antiprompt should be the userprompt like "\nUser:",
   because it's added when n_predict is reached (aka generation ended prematurely)
 * n_predict can be set to -1 for unlimited length responses (or just a really high value)
 * Instruction mode adds its own antiprompt.
   You should also still be feeding the model with a "primer" prompt that 
   shows it the expected format.
"""

import ctypes
import sys
from time import time
from os import cpu_count, path

import llama_cpp
from common import GptParams, gpt_params_parse, gpt_random_prompt
import util


# A LLaMA interactive session
class LLaMAInteract:
    def __init__(self, params: GptParams) -> None:
        # input args
        self.params = params
        if self.params.path_session is None:
            self.params.path_session = ""
        if self.params.antiprompt is None:
            self.params.antiprompt = ""

        if self.params.perplexity:
            raise NotImplementedError(
                """************
please use the 'perplexity' tool for perplexity calculations
************"""
            )

        if self.params.embedding:
            raise NotImplementedError(
                """************
please use the 'embedding' tool for embedding calculations
************"""
            )

        if self.params.n_ctx > 2048:
            print(
                f"""warning: model does not support \
context sizes greater than 2048 tokens ({self.params.n_ctx} \
specified) expect poor results""",
                file=sys.stderr,
            )

        if self.params.seed <= 0:
            self.params.seed = int(time())

        print(f"seed = {self.params.seed}", file=sys.stderr)

        if self.params.random_prompt:
            self.params.prompt = gpt_random_prompt(self.params.seed)

        # runtime args
        self.input_consumed = 0
        self.n_past = 0
        self.n_session_consumed = 0
        self.first_antiprompt = []
        self.remaining_tokens = self.params.n_predict
        self.output_echo = self.params.input_echo
        self.multibyte_fix = []

        # model load
        self.lparams = llama_cpp.llama_model_default_params()
        self.lparams.n_ctx = self.params.n_ctx
        self.lparams.n_parts = self.params.n_parts
        self.lparams.seed = self.params.seed
        self.lparams.memory_f16 = self.params.memory_f16
        self.lparams.use_mlock = self.params.use_mlock
        self.lparams.use_mmap = self.params.use_mmap

        self.model = llama_cpp.llama_load_model_from_file(
            self.params.model.encode("utf8"), self.lparams
        )

        # Context Params.
        self.cparams = llama_cpp.llama_context_default_params()

        self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
        if not self.ctx:
            raise RuntimeError(f"error: failed to load model '{self.params.model}'")

        if self.params.ignore_eos:
            self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf")

        if len(self.params.lora_adapter) > 0:
            if (
                llama_cpp.llama_apply_lora_from_file(
                    self.ctx,
                    self.params.lora_adapter.encode("utf8"),
                    (
                        self.params.lora_base.encode("utf8")
                        if len(self.params.lora_base) > 0
                        else None
                    ),
                    self.params.n_threads,
                )
                != 0
            ):
                print("error: failed to apply lora adapter")
                return

        print(file=sys.stderr)
        print(
            f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \
| {llama_cpp.llama_print_system_info().decode('utf8')}",
            file=sys.stderr,
        )

        # determine the required inference memory per token:
        if self.params.mem_test:
            tmp = [0, 1, 2, 3]
            llama_cpp.llama_eval(
                self.ctx,
                (llama_cpp.c_int * len(tmp))(*tmp),
                len(tmp),
                0,
                self.n_threads,
            )
            llama_cpp.llama_print_timings(self.ctx)
            self.exit()
            return

        # create internal context
        self.n_ctx = llama_cpp.llama_n_ctx(self.ctx)

        # Add a space in front of the first character to match OG llama tokenizer behavior
        self.params.prompt = " " + self.params.prompt

        # Load prompt file
        if self.params.file:
            with open(self.params.file) as f:
                self.params.prompt = f.read()

        self.session_tokens: list[llama_cpp.llama_token] = []
        if len(self.params.path_session) > 0:
            print(
                f"attempting to load saved session from '{self.params.path_session}'",
                file=sys.stderr,
            )

            if path.exists(self.params.path_session):
                _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))()
                _n_token_count_out = llama_cpp.c_size_t()
                if (
                    llama_cpp.llama_load_session_file(
                        self.ctx,
                        self.params.path_session.encode("utf8"),
                        _session_tokens,
                        self.params.n_ctx,
                        ctypes.byref(_n_token_count_out),
                    )
                    != 1
                ):
                    print(
                        f"error: failed to load session file '{self.params.path_session}'",
                        file=sys.stderr,
                    )
                    return
                _n_token_count_out = _n_token_count_out.value
                self.session_tokens = _session_tokens[:_n_token_count_out]
                print(
                    f"loaded a session with prompt size of {_n_token_count_out} tokens",
                    file=sys.stderr,
                )
            else:
                print(f"session file does not exist, will create", file=sys.stderr)

        # tokenize the prompt
        self.embd = []
        self.embd_inp = self._tokenize(self.params.prompt)

        if len(self.embd_inp) > self.n_ctx - 4:
            raise RuntimeError(
                f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})"
            )

        # debug message about similarity of saved session, if applicable
        self.n_matching_session_tokens = 0
        if len(self.session_tokens) > 0:
            for id in self.session_tokens:
                if (
                    self.n_matching_session_tokens >= len(self.embd_inp)
                    or id != self.embd_inp[self.n_matching_session_tokens]
                ):
                    break
                self.n_matching_session_tokens += 1

            if self.n_matching_session_tokens >= len(self.embd_inp):
                print(f"session file has exact match for prompt!")
            elif self.n_matching_session_tokens < (len(self.embd_inp) / 2):
                print(
                    f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated"
                )
            else:
                print(
                    f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt"
                )

        self.need_to_save_session = len(
            self.params.path_session
        ) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4)

        # number of tokens to keep when resetting context
        if (
            self.params.n_keep < 0
            or self.params.n_keep > len(self.embd_inp)
            or self.params.instruct
        ):
            self.params.n_keep = len(self.embd_inp)

        self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix)
        self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False)

        # in instruct mode, we inject a prefix and a suffix to each input by the user
        self.antiecho = None
        if self.params.instruct:
            self.params.interactive_start = True
            _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False)
            self.first_antiprompt.append(_ptn)
            self.antiecho = util.IterSearch(_ptn)

        # enable interactive mode if reverse prompt or interactive start is specified
        if len(self.params.antiprompt) != 0 or self.params.interactive_start:
            self.params.interactive = True

        # determine newline token
        self.llama_token_newline = self._tokenize("\n", False)
        self.llama_token_eot = self._tokenize(" [end of text]\n", False)

        if self.params.verbose_prompt:
            print(
                f"""
prompt: '{self.params.prompt}'
number of tokens in prompt = {len(self.embd_inp)}""",
                file=sys.stderr,
            )

            for i in range(len(self.embd_inp)):
                print(
                    f"{self.embd_inp[i]} -> '{self.token_to_str(self.embd_inp[i])}'",
                    file=sys.stderr,
                )

            if self.params.n_keep > 0:
                print("static prompt based on n_keep: '")
                for i in range(self.params.n_keep):
                    print(self.token_to_str(self.embd_inp[i]), file=sys.stderr)
                print("'", file=sys.stderr)
            print(file=sys.stderr)

        if self.params.interactive:
            print("interactive mode on.", file=sys.stderr)

            if len(self.params.antiprompt) > 0:
                for antiprompt in self.params.antiprompt:
                    print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr)

            if len(self.params.input_prefix) > 0:
                print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr)

        print(
            f"""sampling: repeat_last_n = {self.params.repeat_last_n},\
repeat_penalty = {self.params.repeat_penalty},\
presence_penalty = {self.params.presence_penalty},\
frequency_penalty = {self.params.frequency_penalty},\
top_k = {self.params.top_k},\
tfs_z = {self.params.tfs_z},\
top_p = {self.params.top_p},\
typical_p = {self.params.typical_p},\
temp = {self.params.temp},\
mirostat = {self.params.mirostat},\
mirostat_lr = {self.params.mirostat_eta},\
mirostat_ent = {self.params.mirostat_tau},\

generate: n_ctx = {self.n_ctx},\
n_batch = {self.params.n_batch},\
n_predict = {self.params.n_predict},\
n_keep = {self.params.n_keep}

""",
            file=sys.stderr,
        )

        # determine antiprompt tokens
        for i in self.params.antiprompt:
            self.first_antiprompt.append(self._tokenize(i, False))

        self.last_n_tokens = [0] * self.n_ctx  # TODO: deque doesnt support slices

        if params.interactive:
            print(
                """== Running in interactive mode. ==
 - Press Ctrl+C to interject at any time.
 - Press Return to return control to LLaMa.
 - If you want to submit another line, end your input in '\\'.

""",
                file=sys.stderr,
            )
        self.set_color(util.CONSOLE_COLOR_PROMPT)

    # tokenize a prompt
    def _tokenize(self, prompt, bos=True):
        _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
        _n = llama_cpp.llama_tokenize(
            self.model,
            prompt.encode("utf8", errors="ignore"),
            len(prompt),
            _arr,
            len(_arr),
            bos,
            False,
        )
        return _arr[:_n]

    def set_color(self, c):
        if self.params.use_color:
            print(c, end="")

    def use_antiprompt(self):
        return len(self.first_antiprompt) > 0

    # generate tokens
    def generate(self):
        while (
            self.remaining_tokens > 0
            or self.params.interactive
            or self.params.n_predict == -1
        ):
            # predict
            if len(self.embd) > 0:
                # infinite text generation via context swapping
                # if we run out of context:
                # - take the n_keep first tokens from the original prompt (via n_past)
                # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
                if self.n_past + len(self.embd) > self.n_ctx:
                    n_left = self.n_past - self.params.n_keep
                    self.n_past = self.params.n_keep

                    # insert n_left/2 tokens at the start of embd from last_n_tokens
                    _insert = self.last_n_tokens[
                        self.n_ctx - int(n_left / 2) - len(self.embd) : -len(self.embd)
                    ]
                    self.embd = _insert + self.embd
                    self.params.path_session = ""

                # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
                if self.n_session_consumed < len(self.session_tokens):
                    for i in range(len(self.embd)):
                        if self.embd[i] != self.session_tokens[self.n_session_consumed]:
                            self.session_tokens = self.session_tokens[
                                : self.n_session_consumed
                            ]
                            break

                        self.n_past += 1
                        self.n_session_consumed += 1

                        if self.n_session_consumed >= len(self.session_tokens):
                            i += 1
                            break

                    if i > 0:
                        self.embd = self.embd[i:]

                # evaluate tokens in batches
                # embd is typically prepared beforehand to fit within a batch, but not always
                # TODO BUG: The batching code causes nonsensical generation
                """for i in range(0, len(self.embd), self.params.n_batch):
					n_eval = self.params.n_batch
					_arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval])
					if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0:
						print(f"failed to eval")
						return
					
					self.n_past += n_eval"""

                if (
                    llama_cpp.llama_eval(
                        self.ctx,
                        (llama_cpp.llama_token * len(self.embd))(*self.embd),
                        len(self.embd),
                        self.n_past,
                    )
                    != 0
                ):
                    raise Exception("Failed to llama_eval!")

                if len(self.embd) > 0 and len(self.params.path_session) > 0:
                    self.session_tokens.extend(self.embd)
                    self.n_session_consumed = len(self.session_tokens)

            self.n_past += len(self.embd)
            self.embd = []
            if len(self.embd_inp) <= self.input_consumed:  # && !is_interacting
                # out of user input, sample next token
                top_k = (
                    llama_cpp.llama_n_vocab(self.ctx)
                    if self.params.top_k <= 0
                    else self.params.top_k
                )
                repeat_last_n = (
                    self.n_ctx
                    if self.params.repeat_last_n < 0
                    else self.params.repeat_last_n
                )

                # optionally save the session on first sample (for faster prompt loading next time)
                if len(self.params.path_session) > 0 and self.need_to_save_session:
                    self.need_to_save_session = False
                    llama_cpp.llama_save_session_file(
                        self.ctx,
                        self.params.path_session.encode("utf8"),
                        (llama_cpp.llama_token * len(self.session_tokens))(
                            *self.session_tokens
                        ),
                        len(self.session_tokens),
                    )

                id = 0

                logits = llama_cpp.llama_get_logits(self.ctx)
                n_vocab = llama_cpp.llama_n_vocab(self.model)

                # Apply params.logit_bias map
                for key, value in self.params.logit_bias.items():
                    logits[key] += value

                _arr = (llama_cpp.llama_token_data * n_vocab)(
                    *[
                        llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
                        for token_id in range(n_vocab)
                    ]
                )
                candidates_p = llama_cpp.ctypes.pointer(
                    llama_cpp.llama_token_data_array(_arr, len(_arr), False)
                )

                # Apply penalties
                nl_logit = logits[llama_cpp.llama_token_nl(self.ctx)]
                last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)

                _arr = (llama_cpp.llama_token * last_n_repeat)(
                    *self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat :]
                )
                llama_cpp.llama_sample_repetition_penalties(
                    ctx=self.ctx,
                    candidates=candidates_p,
                    last_tokens_data=_arr,
                    penalty_last_n=last_n_repeat,
                    penalty_repeat=llama_cpp.c_float(self.params.repeat_penalty),
                    penalty_freq=llama_cpp.c_float(self.params.frequency_penalty),
                    penalty_present=llama_cpp.c_float(self.params.presence_penalty),
                )

                # NOT PRESENT IN CURRENT VERSION ?
                # llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
                # 	_arr,
                # 	last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))

                if not self.params.penalize_nl:
                    logits[llama_cpp.llama_token_nl()] = nl_logit

                if self.params.temp <= 0:
                    # Greedy sampling
                    id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p)
                else:
                    if self.params.mirostat == 1:
                        mirostat_mu = 2.0 * self.params.mirostat_tau
                        mirostat_m = 100
                        llama_cpp.llama_sample_temperature(
                            self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
                        )
                        id = llama_cpp.llama_sample_token_mirostat(
                            self.ctx,
                            candidates_p,
                            llama_cpp.c_float(self.params.mirostat_tau),
                            llama_cpp.c_float(self.params.mirostat_eta),
                            llama_cpp.c_int(mirostat_m),
                            llama_cpp.c_float(mirostat_mu),
                        )
                    elif self.params.mirostat == 2:
                        mirostat_mu = 2.0 * self.params.mirostat_tau
                        llama_cpp.llama_sample_temperature(
                            self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
                        )
                        id = llama_cpp.llama_sample_token_mirostat_v2(
                            self.ctx,
                            candidates_p,
                            llama_cpp.c_float(self.params.mirostat_tau),
                            llama_cpp.c_float(self.params.mirostat_eta),
                            llama_cpp.c_float(mirostat_mu),
                        )
                    else:
                        # Temperature sampling
                        llama_cpp.llama_sample_top_k(
                            self.ctx,
                            candidates_p,
                            top_k,
                            min_keep=llama_cpp.c_size_t(1),
                        )
                        llama_cpp.llama_sample_tail_free(
                            self.ctx,
                            candidates_p,
                            llama_cpp.c_float(self.params.tfs_z),
                            min_keep=llama_cpp.c_size_t(1),
                        )
                        llama_cpp.llama_sample_typical(
                            self.ctx,
                            candidates_p,
                            llama_cpp.c_float(self.params.typical_p),
                            min_keep=llama_cpp.c_size_t(1),
                        )
                        llama_cpp.llama_sample_top_p(
                            self.ctx,
                            candidates_p,
                            llama_cpp.c_float(self.params.top_p),
                            min_keep=llama_cpp.c_size_t(1),
                        )
                        llama_cpp.llama_sample_temperature(
                            self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)
                        )
                        id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
                # print("`{}`".format(candidates_p.size))

                self.last_n_tokens.pop(0)
                self.last_n_tokens.append(id)

                # replace end of text token with newline token when in interactive mode
                if (
                    id == llama_cpp.llama_token_eos(self.ctx)
                    and self.params.interactive
                    and not self.params.instruct
                ):
                    id = self.llama_token_newline[0]
                    self.embd.append(id)
                    if self.use_antiprompt():
                        # tokenize and inject first reverse prompt
                        self.embd_inp += self.first_antiprompt[0]
                        for id in self.first_antiprompt[0]:
                            self.embd.append(id)
                else:
                    # add it to the context
                    self.embd.append(id)

                # echo this to console
                self.output_echo = True

                # decrement remaining sampling budget
                self.remaining_tokens -= 1
            else:
                # output to console if input echo is on
                self.output_echo = self.params.input_echo

                # some user input remains from prompt or interaction, forward it to processing
                while len(self.embd_inp) > self.input_consumed:
                    self.embd.append(self.embd_inp[self.input_consumed])
                    self.last_n_tokens.pop(0)
                    self.last_n_tokens.append(self.embd_inp[self.input_consumed])
                    self.input_consumed += 1
                    if len(self.embd) >= self.params.n_batch:
                        break

            # display tokens
            if self.output_echo:
                for id in self.embd:
                    if self.antiecho != None:
                        for r in self.antiecho(id):
                            yield r
                    else:
                        yield id

            # reset color to default if we there is no pending user input
            if self.params.input_echo and len(self.embd_inp) == self.input_consumed:
                self.set_color(util.CONSOLE_COLOR_DEFAULT)

            if self.params.interactive and len(self.embd_inp) <= self.input_consumed:
                # if antiprompt is present, stop
                if self.use_antiprompt():
                    if True in [
                        i == self.last_n_tokens[-len(i) :]
                        for i in self.first_antiprompt
                    ]:
                        break

                # if we are using instruction mode, and we have processed the initial prompt
                if self.params.interactive_start:
                    break

            # end of text token
            if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(
                self.ctx
            ):
                if not self.params.instruct:
                    for i in self.llama_token_eot:
                        yield i
                    break

            # respect n_predict even if antiprompt is present
            if (
                self.params.interactive
                and self.remaining_tokens <= 0
                and self.params.n_predict != -1
            ):
                # If we arent in instruction mode, fix the current generation by appending the antiprompt.
                # Makes it so if chat ends prematurely you dont append the AI's text etc.
                if not self.params.instruct:
                    self.embd_inp += self.first_antiprompt[0]
                self.n_remain = self.params.n_predict
                break

        self.params.interactive_start = False

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.exit()

    def exit(self):
        llama_cpp.llama_free(self.ctx)
        self.set_color(util.CONSOLE_COLOR_DEFAULT)

    def token_to_str(self, token_id: int) -> bytes:
        size = 32
        buffer = (ctypes.c_char * size)()
        n = llama_cpp.llama_token_to_piece(
            self.model, llama_cpp.llama_token(token_id), buffer, size
        )
        assert n <= size
        return bytes(buffer[:n])

    # return past text
    def past(self):
        for id in self.last_n_tokens[-self.n_past :]:
            yield self.token_to_str(id).decode("utf8", errors="ignore")

    # write input
    def input(self, prompt: str):
        if (
            self.params.instruct
            and self.last_n_tokens[-len(self.inp_prefix) :] != self.inp_prefix
        ):
            self.embd_inp += self.inp_prefix
        self.embd_inp += self._tokenize(prompt)
        if self.params.instruct:
            self.embd_inp += self.inp_suffix

    # write output
    def output(self):
        self.remaining_tokens = self.params.n_predict
        for id in self.generate():
            cur_char = self.token_to_str(id)

            # Add remainder of missing bytes
            if None in self.multibyte_fix:
                self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char

            # Return completed utf char
            if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix:
                yield (b"".join(self.multibyte_fix)).decode("utf8")
                self.multibyte_fix = []
                continue

            # Contains multi-byte UTF8
            for num, pattern in [(2, 192), (3, 224), (4, 240)]:
                # Bitwise AND check
                if pattern & int.from_bytes(cur_char, "little") == pattern:
                    self.multibyte_fix = [cur_char] + ([None] * (num - 1))

            # Stop incomplete bytes from passing
            if len(self.multibyte_fix) > 0:
                continue

            yield cur_char.decode("utf8")

    # read user input
    def read_input(self):
        out = ""
        while (t := input()).endswith("\\"):
            out += t[:-1] + "\n"
        return out + t + "\n"

    # interactive mode
    def interact(self):
        for i in self.output():
            print(i, end="", flush=True)
        self.params.input_echo = False

        # Using string instead of tokens to check for antiprompt,
        # It is more reliable than tokens for interactive mode.
        generated_str = ""
        while self.params.interactive:
            self.set_color(util.CONSOLE_COLOR_USER_INPUT)
            if self.params.instruct:
                print("\n> ", end="")
                self.input(self.read_input())
            else:
                print(self.params.input_prefix, end="")
                self.input(
                    f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}"
                )
                print(self.params.input_suffix, end="")
            self.set_color(util.CONSOLE_COLOR_DEFAULT)

            try:
                for i in self.output():
                    print(i, end="", flush=True)
                    generated_str += i
                    for ap in self.params.antiprompt:
                        if generated_str.endswith(ap):
                            raise KeyboardInterrupt
            except KeyboardInterrupt:
                self.set_color(util.CONSOLE_COLOR_DEFAULT)
                if not self.params.instruct:
                    print(self.params.fix_prefix, end="")
                    self.input(self.params.fix_prefix)


if __name__ == "__main__":
    from datetime import datetime

    USER_NAME = "User"
    AI_NAME = "ChatLLaMa"

    # The built-in primer prompt quotes the current time and year, both taken
    # from one clock reading.
    time_now = datetime.now()
    clock_text = time_now.strftime("%H:%M")
    year_text = time_now.strftime("%Y")
    prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.

{USER_NAME}: Hello, {AI_NAME}!
{AI_NAME}: Hello {USER_NAME}! How may I help you today?
{USER_NAME}: What time is it?
{AI_NAME}: It is {clock_text}.
{USER_NAME}: What year is it?
{AI_NAME}: We are in {year_text}.
{USER_NAME}: What is a cat?
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{USER_NAME}: Name a color.
{AI_NAME}: Blue
{USER_NAME}:   """

    params = gpt_params_parse()
    # Fall back to the primer only when neither a prompt nor a prompt file was given.
    no_prompt_given = params.prompt is None and params.file is None
    if no_prompt_given:
        params.prompt = prompt

    with LLaMAInteract(params) as session:
        session.interact()