diff --git a/README.md b/README.md
index faef6ad..40b160d 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,37 @@
-# mpt-30B-inference
\ No newline at end of file
+# MPT 30B inference code using CPU
+
+Run inference on the latest MPT-30B model using your CPU.
+
+![Inference Demo](media/inference-demo.mp4)
+
+## Requirements
+
+I recommend using Docker for this model; it will make everything easier. Tested on cuda-11.8.0 with an AMD EPYC CPU.
+
+## Setup
+
+First, create a venv.
+
+```sh
+python -m venv env && source env/bin/activate
+```
+
+Next, install the dependencies.
+
+```sh
+pip install -r requirements.txt
+```
+
+Next, download the quantized model weights (about 19 GB).
+
+```sh
+python download_model.py
+```
+
+Ready to rock: run inference.
+
+```sh
+python inference.py
+```
+
+Finally, modify the prompt and generation parameters in the inference script to suit your use case.
\ No newline at end of file
diff --git a/download_model.py b/download_model.py
new file mode 100644
index 0000000..7601eb2
--- /dev/null
+++ b/download_model.py
@@ -0,0 +1,18 @@
+import os
+from huggingface_hub import hf_hub_download
+
+
+def download_mpt_quant(destination_folder):
+    local_path = os.path.relpath(destination_folder)
+    return hf_hub_download(
+        repo_id="TheBloke/mpt-30B-chat-GGML",
+        filename="mpt-30b-chat.ggmlv0.q4_1.bin",
+        cache_dir=local_path,
+    )
+
+
+if __name__ == "__main__":
+    """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin"""
+
+    destination_folder = "models"
+    download_mpt_quant(destination_folder)
diff --git a/inference.py b/inference.py
new file mode 100644
index 0000000..e8899c8
--- /dev/null
+++ b/inference.py
@@ -0,0 +1,68 @@
+from ctransformers import AutoModelForCausalLM, AutoConfig
+
+
+def format_prompt(system_prompt, user_prompt):
+    """Format the prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py"""
+
+    system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+    user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
+    assistant_prompt = "<|im_start|>assistant\n"
+
+    return f"{system_prompt}{user_prompt}{assistant_prompt}"
+
+
+def format_output(user_prompt):
+    return f"[user]: {user_prompt}\n[assistant]:"
+
+
+def generate(llm, system_prompt, user_prompt):
+    """Run model inference; returns a generator when streaming is enabled."""
+
+    return llm(
+        format_prompt(
+            system_prompt,
+            user_prompt,
+        ),
+        temperature=0.2,
+        top_k=0,
+        top_p=0.9,
+        repetition_penalty=1.0,
+        max_new_tokens=512,  # adjust as needed
+        seed=42,
+        reset=True,  # reset history (cache)
+        stream=True,  # stream output token by token
+        threads=24,  # adjust for your CPU
+        stop=["<|im_end|>", "|<"],
+    )
+
+
+if __name__ == "__main__":
+    config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
+    llm = AutoModelForCausalLM.from_pretrained(
+        "models/mpt-30b-chat.ggmlv0.q4_1.bin",
+        model_type="mpt",
+        config=config,
+    )
+
+    print(config)
+    system_prompt = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."
+
+    user_prompts = [
+        "What is 2 + 2?",
+        "What is 12 + 2?",
+        "What is 5 + 7?",
+        "What is 3 * 2?",
+        "What is 4 / 2?",
+        "Who was the first president of the US?",
+        "Can humans ever set foot on mars?",
+    ]
+
+    for user_prompt in user_prompts:
+        generator = generate(llm, system_prompt, user_prompt)
+        print(format_output(user_prompt), end=" ", flush=True)
+        for word in generator:
+            print(word, end="", flush=True)
+
+        # print empty line
+        print("")
+        print(80 * "=")
diff --git a/media/inference-demo.mp4 b/media/inference-demo.mp4
new file mode 100644
index 0000000..0242479
Binary files /dev/null and b/media/inference-demo.mp4 differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..400d30e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+ctransformers==0.2.10
+transformers==4.30.2
\ No newline at end of file
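
For the README's final step (modifying the prompt and generation parameters), here is a minimal sketch of what such a tweak could look like, reusing the helpers introduced in `inference.py`. The system prompt text, question, and parameter values below are illustrative assumptions, not part of this change.

```python
# Illustrative only: example prompt and parameter values, not part of this diff.
from ctransformers import AutoModelForCausalLM, AutoConfig

from inference import format_prompt  # helper added in this diff

# Load the quantized model exactly as inference.py does.
config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
llm = AutoModelForCausalLM.from_pretrained(
    "models/mpt-30b-chat.ggmlv0.q4_1.bin",
    model_type="mpt",
    config=config,
)

# Custom system prompt and question (made-up examples).
system_prompt = "You are a terse assistant that answers in at most two sentences."
prompt = format_prompt(system_prompt, "Why quantize a 30B model for CPU inference?")

# Same keyword arguments as generate() in inference.py, with a few values changed.
for token in llm(
    prompt,
    temperature=0.7,      # higher than the script's 0.2 for more varied wording
    top_p=0.9,
    max_new_tokens=256,   # shorter answers
    threads=8,            # set to your physical core count
    stream=True,
    stop=["<|im_end|>"],
):
    print(token, end="", flush=True)
print()
```

The keyword arguments mirror the `llm(...)` call inside `generate()`: `threads` should match the number of physical cores available, and `stream=False` would return the full completion as a single string instead of a generator.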