initial commit
This commit is contained in:
parent ca435c46b0
commit dd791e1de0
5 changed files with 125 additions and 1 deletion
38 README.md
@@ -1 +1,37 @@
-# mpt-30B-inference
+# MPT 30B inference code using CPU

Run inference on the latest MPT-30B model using your CPU.



## Requirements

I recommend using Docker for this model; it will make everything easier for you. Tested on cuda-11.8.0 with an AMD EPYC CPU.

## Setup

First, create a venv.

```sh
python -m venv env && source env/bin/activate
```

Next, install the dependencies.

```sh
pip install -r requirements.txt
```

Next, download the quantized model weights (about 19 GB).

```sh
python download_model.py
```

Ready to rock: run inference.

```sh
python inference.py
```

Finally, modify the inference script's prompt and generation parameters to suit your needs.
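As a rough example (the prompt text and parameter values below are illustrative assumptions, not recommendations from this repo), those edits might look like this:

```python
# Illustrative edits to inference.py -- example values only; adjust for your use case.

# 1. Change the prompts defined near the bottom of the script.
system_prompt = "You are a concise assistant that answers programming questions."
user_prompts = ["Write a haiku about CPUs."]

# 2. Tune the sampling settings passed to llm(...) inside generate(), e.g.:
#    temperature=0.8       # higher values give more varied output
#    max_new_tokens=1024   # allow longer completions
#    threads=8             # match the number of physical CPU cores
```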
18 download_model.py Normal file
@@ -0,0 +1,18 @@
import os
from huggingface_hub import hf_hub_download


def download_mpt_quant(destination_folder):
    local_path = os.path.relpath(destination_folder)
    return hf_hub_download(
        repo_id="TheBloke/mpt-30B-chat-GGML",
        filename="mpt-30b-chat.ggmlv0.q4_1.bin",
        cache_dir=local_path,
    )


if __name__ == "__main__":
    """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin"""

    destination_folder = "models"
    download_mpt_quant(destination_folder)
68 inference.py Normal file
@@ -0,0 +1,68 @@
from ctransformers import AutoModelForCausalLM, AutoConfig


def format_prompt(system_prompt, user_prompt):
    """format prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py"""

    system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    assistant_prompt = f"<|im_start|>assistant\n"

    return f"{system_prompt}{user_prompt}{assistant_prompt}"


def format_output(user_prompt):
    return f"[user]: {user_prompt}\n[assistant]:"


def generate(llm, system_prompt, user_prompt):
    """run model inference, will return a Generator if streaming is true"""

    return llm(
        format_prompt(
            system_prompt,
            user_prompt,
        ),
        temperature=0.2,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=512,  # adjust as needed
        seed=42,
        reset=True,  # reset history (cache)
        stream=True,  # streaming per word/token
        threads=24,  # adjust for your CPU
        stop=["<|im_end|>", "|<"],
    )


if __name__ == "__main__":
    config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
    llm = AutoModelForCausalLM.from_pretrained(
        "models/mpt-30b-chat.ggmlv0.q4_1.bin",
        model_type="mpt",
        config=config,
    )

    print(config)
    system_prompt = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."

    user_prompts = [
        "What is 2 + 2?",
        "What is 12 + 2?",
        "What is 5 + 7?",
        "What is 3 * 2?",
        "What is 4 / 2?",
        "Who was the first president of the US?",
        "Can humans ever set foot on mars?",
    ]

    for user_prompt in user_prompts:
        generator = generate(llm, system_prompt, user_prompt)
        print(format_output(user_prompt), end=" ", flush=True)
        for word in generator:
            print(word, end="", flush=True)

        # print empty line
        print("")
        print(80 * "=")
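Since `generate` is called with `stream=True`, it returns a generator that yields the completion one token at a time. If you want the whole answer as a single string instead of printing tokens as they arrive, a minimal sketch (reusing `llm`, `system_prompt`, and `generate` as defined in inference.py above) could be:

```python
# Collect the streamed tokens into one string instead of printing them live.
# Assumes `llm`, `system_prompt`, and `generate` are defined as in inference.py.
answer = "".join(generate(llm, system_prompt, "What is 2 + 2?"))
print(answer)
```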
BIN media/inference-demo.mp4 Normal file
Binary file not shown.
2 requirements.txt Normal file
@@ -0,0 +1,2 @@
ctransformers==0.2.10
transformers==4.30.2