initial commit

This commit is contained in:
Anton Bacaj 2023-06-26 05:36:27 +00:00
parent ca435c46b0
commit dd791e1de0
5 changed files with 125 additions and 1 deletion

README.md

@@ -1 +1,37 @@
# mpt-30B-inference
# MPT 30B inference code using CPU
Run inference on the latest MPT-30B model using your CPU.
![Inference Demo](media/inference-demo.mp4)
## Requirements
I recommend using Docker for this model; it will make everything easier. Tested on cuda-11.8.0 with an AMD EPYC CPU.
## Setup
First create a venv.
```sh
python -m venv env && source env/bin/activate
```
Next install dependencies.
```sh
pip install -r requirements.txt
```
Next download the quantized model weights (about 19GB).
```sh
python download_model.py
```
Ready to rock: run inference.
```sh
python inference.py
```
Finally, modify the prompt and generation parameters in the inference script to suit your use case; an illustrative tweak is sketched below.
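
For example (illustrative values only, not the repo's defaults), the system prompt and sampling settings in the `__main__` block of `inference.py` could be adjusted like this:

```python
# Illustrative tweak to inference.py -- example values, adjust to taste.
system_prompt = "You are a concise assistant. Answer in one or two sentences."

generator = llm(
    format_prompt(system_prompt, "Explain what a 4-bit quantized model is."),
    temperature=0.7,      # higher temperature = more varied wording
    top_p=0.9,
    max_new_tokens=256,   # cap the response length
    threads=8,            # set to your physical core count
    stream=True,
    stop=["<|im_end|>"],
)
for token in generator:
    print(token, end="", flush=True)
```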

18
download_model.py Normal file

@@ -0,0 +1,18 @@
import os

from huggingface_hub import hf_hub_download


def download_mpt_quant(destination_folder):
    local_path = os.path.relpath(destination_folder)
    return hf_hub_download(
        repo_id="TheBloke/mpt-30B-chat-GGML",
        filename="mpt-30b-chat.ggmlv0.q4_1.bin",
        cache_dir=local_path,
    )


if __name__ == "__main__":
    """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin"""
    destination_folder = "models"
    download_mpt_quant(destination_folder)
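
If you want to confirm where the roughly 19 GB file ended up, note that `hf_hub_download` returns the local path of the downloaded file; a quick check (a sketch, assuming you run it from the repo root with the requirements installed) is:

```python
# Sanity check (not part of the repo): print where the weights were cached.
from download_model import download_mpt_quant

path = download_mpt_quant("models")
print(path)  # re-running reuses the cached file instead of downloading again
```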

68
inference.py Normal file

@@ -0,0 +1,68 @@
from ctransformers import AutoModelForCausalLM, AutoConfig


def format_prompt(system_prompt, user_prompt):
    """format prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py"""
    system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    assistant_prompt = f"<|im_start|>assistant\n"
    return f"{system_prompt}{user_prompt}{assistant_prompt}"


def format_output(user_prompt):
    return f"[user]: {user_prompt}\n[assistant]:"


def generate(llm, system_prompt, user_prompt):
    """run model inference, will return a Generator if streaming is true"""
    return llm(
        format_prompt(
            system_prompt,
            user_prompt,
        ),
        temperature=0.2,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=512,  # adjust as needed
        seed=42,
        reset=True,  # reset history (cache)
        stream=True,  # streaming per word/token
        threads=24,  # adjust for your CPU
        stop=["<|im_end|>", "|<"],
    )


if __name__ == "__main__":
    config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
    llm = AutoModelForCausalLM.from_pretrained(
        "models/mpt-30b-chat.ggmlv0.q4_1.bin",
        model_type="mpt",
        config=config,
    )

    print(config)

    system_prompt = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."

    user_prompts = [
        "What is 2 + 2?",
        "What is 12 + 2?",
        "What is 5 + 7?",
        "What is 3 * 2?",
        "What is 4 / 2?",
        "Who was the first president of the US?",
        "Can humans ever set foot on mars?",
    ]

    for user_prompt in user_prompts:
        generator = generate(llm, system_prompt, user_prompt)
        print(format_output(user_prompt), end=" ", flush=True)
        for word in generator:
            print(word, end="", flush=True)
        # print empty line
        print("")
        print(80 * "=")
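
As a quick illustration of the ChatML-style template that `format_prompt` builds (not part of the original file; assumes the repo root is the working directory and the requirements are installed):

```python
from inference import format_prompt

print(format_prompt("You are helpful.", "What is 2 + 2?"))
# <|im_start|>system
# You are helpful.<|im_end|>
# <|im_start|>user
# What is 2 + 2?<|im_end|>
# <|im_start|>assistant
```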

BIN
media/inference-demo.mp4 Normal file

Binary file not shown.

2
requirements.txt Normal file

@@ -0,0 +1,2 @@
ctransformers==0.2.10
transformers==4.30.2