diff --git a/README.md b/README.md index 0f5f9b6..5115628 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ USERNAME = 'Steve' # create state-of-the-art chat bot myFancyChatBot = client('Meta-Llama-3-8B-Instruct.Q2_K.gguf', 'MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF', - llama_version="llama-3") + chat_format="llama-3") myChatBot.setConfig(username=USERNAME) @@ -47,7 +47,7 @@ webUI(cl) - Has automatic fallbacks for different weight formats (e.g. GGML, GGUF, bin, ..) ## Base Requirements -- Python 3.11 +- Python >=3.10.12 - A system with a CPU (preferably Ryzen) and `>=16GB` RAM - Assumes drivers were correctly installed and GPU is detectable via rocm-smi, nvidia-smi etc. - A solid GPT chat requires `>=6GB` of RAM/vRAM depending on device. @@ -149,7 +149,6 @@ cl.chat( 'obedient' ], username='Human', - do_sample=False, temperature=0.8, repetition_penalty=1.1) ``` @@ -190,7 +189,6 @@ cl.setConfig( 'obedient' ], username='Human', - do_sample=True, temperature=0.8, repetition_penalty=1.1 ) @@ -212,7 +210,6 @@ cl.setConfig( 'obedient' ], username='Human', - do_sample=True, temperature=0.8, repetition_penalty=1.1 ) @@ -222,7 +219,6 @@ from blowtorch import webUI webUI(cl) ``` -## Expose Objects for your Chat @@ -231,40 +227,6 @@ webUI(cl) - -
-Command Line Inference (CLI) - ---- - -Pre-trained models like e.g. Llama2 can directly be ported from huggingface hub, and subsequently propagate inputs or inference, through the model. ``Note:`` that the inference method is the lowest level and will not pre- or post process, track the context. These steps will be considered with the ``contextInference`` method which is used by chat and webUI. - - -```python -from blowtorch import client - -AI = client(hugging_face_path='TheBloke/Llama-2-7B-Chat-GGML', device='cpu', model_type="llama") # model_type is transformer compliant arg -# start the command line interface for text interaction with some transformer.pipeline arguments -AI.cli(max_new_tokens=64, do_sample=True, temperature=0.8, repetition_penalty=1.2) -``` -```python -Human:special relativity -Llama-2-7B-Chat-GGML: [{'generated_text': "special relativity and the meaning of time\n\nTime and its relationship to space are fundamental concepts in physics. According to Newton's laws -of motion, time is a fixed quantity that moves along with space, yet according to Einstein's special relativity, time has no actual physical existence. This paradox has puzzled"}] -``` - -``` -Human: can you explain what a dejavu is? -Llama-2-7B-Chat-GGML: [{'generated_text': 'can you explain what a dejavu is?\n\nAnswer: A deja vu is a French term that refers to a feeling of familiarity or recognition that cannot be explained. It\'s the sensation of having already experienced an event, situation, or place, even though you know that you have not. Deja vu can also be described as a "'}] -``` - -The cli is a useful method is intended for testing forward-propagation but since it is not tracking context, or be reasonable (rather halucinating) or wrapped in tags, the output will simply be a random completion of the input. - -
- - - -
Chat API

@@ -272,14 +234,14 @@

---

-The following is an example of loading a specific model file (from huggingface card) in **GGUF** format. This will be automatically loaded with [ctransformers](https://github.com/marella/ctransformers) in a single line.
+The following example loads a model file at a specific quantization level (from the huggingface model card) in **GGUF** format. GGML/GGUF models are well suited for CPU inference, so when running on CPU the file is loaded automatically with [llama.cpp](https://github.com/ggerganov/llama.cpp) in a single line. blowtorch's chat method tracks the context, so the LLM can argue and even write code. For better output, increase the max token size.


```python
from blowtorch import client

-client('llama-2-7b-chat.Q2_K.gguf', 'TheBloke/Llama-2-7B-Chat-GGUF', 'cpu', model_type="llama").chat(max_new_tokens=512, do_sample=False, temperature=0.8, repetition_penalty=1.1)
+client('llama-2-7b-chat.Q2_K.gguf', 'TheBloke/Llama-2-7B-Chat-GGUF', 'cpu', model_type="llama").chat(max_new_tokens=512, temperature=0.8, repetition_penalty=1.1)
```

Human: please create a python script which loads a huggingface model into transformers.
@@ -344,7 +306,6 @@ client('llama-2-7b-chat.Q2_K.gguf',
 ).chat(
     max_new_tokens=128,
     char_tags=['funnily impersonates Arnold Schwarzenegger', 'joking', 'randomly stating facts about his career', 'hectic'],
-    do_sample=False,
     temperature=0.8, repetition_penalty=1.1
 )
 `
@@ -405,7 +366,6 @@ cl.setConfig(
     max_new_tokens=128,
     scenario=myScenario, # <-- add the scenario to config instead of char_tags
     username='Pacino',
-    do_sample=True,
     temperature=0.85,
     repetition_penalty=1.15,
     top_p=0.95,
@@ -439,7 +399,6 @@ cl.setConfig(
         'obedient'
     ],
     username='Human',
-    do_sample=True,
     temperature=0.8,
     repetition_penalty=1.1
 )
diff --git a/benchmark_llama3_cpu.py b/benchmark_llama3_cpu.py
index de3a031..30d697b 100644
--- a/benchmark_llama3_cpu.py
+++ b/benchmark_llama3_cpu.py
@@ -11,6 +11,6 @@
     name='Assistant',
     device='cpu',
     context_length = 6000,
-    llama_version="llama-3")
+    chat_format="llama-3")
 
 cl.bench(tokens=64)
\ No newline at end of file
diff --git a/examples/classification.py b/examples/classification.py
index dee0dbb..d58f304 100644
--- a/examples/classification.py
+++ b/examples/classification.py
@@ -11,7 +11,7 @@
     'MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF',
     name='Sunday',
     device='cpu',
-    llama_version="llama-3")
+    chat_format="llama-3")
 
 cl.setConfig(
     scenario=scenario,
diff --git a/examples/llama3_GPT.py b/examples/llama3_GPT.py
index 42d6fb8..7336c76 100644
--- a/examples/llama3_GPT.py
+++ b/examples/llama3_GPT.py
@@ -14,7 +14,7 @@
     'MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF',
     name='Llama-GPT',
     device='cpu',
-    llama_version="llama-3")
+    chat_format="llama-3")
 
 cl.setConfig(
     char_tags=[
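
Taken together, these changes move every Llama 3 example from the old `llama_version` keyword to `chat_format="llama-3"` and drop the explicit `do_sample` flags. The sketch below simply mirrors the call pattern from `examples/llama3_GPT.py` and the README excerpts above; the model file name, repository id, and keyword arguments are taken from the diff, while the shortened `char_tags` list and the final `chat` call are illustrative assumptions rather than documented blowtorch API.

```python
# Illustrative sketch only: mirrors examples/llama3_GPT.py after this diff.
# Assumes the blowtorch client / setConfig / chat API as shown in the excerpts above;
# char_tags is trimmed to the single tag visible in the diff.
from blowtorch import client

cl = client('Meta-Llama-3-8B-Instruct.Q2_K.gguf',
            'MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF',
            name='Llama-GPT',
            device='cpu',
            chat_format="llama-3")   # formerly llama_version="llama-3"

cl.setConfig(
    char_tags=[
        'obedient'
    ],
    username='Human',
    temperature=0.8,
    repetition_penalty=1.1
)

# do_sample is no longer passed; sampling behaviour is controlled via
# temperature / repetition_penalty in setConfig or chat.
cl.chat(max_new_tokens=512)
```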