initial commit
This commit is contained in:
parent
ca435c46b0
commit
dd791e1de0
5 changed files with 125 additions and 1 deletions
38
README.md
38
README.md
|
|
@ -1 +1,37 @@
|
|||
# mpt-30B-inference
|
||||
# MPT 30B inference code using CPU
|
||||
|
||||
Run inference on the latest MPT-30B model using your CPU.
|
||||
|
||||

|
||||
|
||||
## Requirements
|
||||
|
||||
I recommend using Docker for this model; it will make everything easier for you. Tested on cuda-11.8.0 with an AMD EPYC CPU.
|
||||
|
||||
## Setup
|
||||
|
||||
First create a venv.
|
||||
|
||||
```sh
|
||||
python -m venv env && source env/bin/activate
|
||||
```
|
||||
|
||||
Next install dependencies.
|
||||
|
||||
```sh
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Next download the quantized model weights (about 19GB).
|
||||
|
||||
```sh
|
||||
python download_model.py
|
||||
```
|
||||
|
||||
Ready to rock: run inference.
|
||||
|
||||
```sh
|
||||
python inference.py
|
||||
```
|
||||
|
||||
Next, modify the prompt and generation parameters in the inference script (`inference.py`).
|
||||
18
download_model.py
Normal file
18
download_model.py
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
import os
|
||||
from huggingface_hub import hf_hub_download
|
||||
|
||||
|
||||
def download_mpt_quant(destination_folder):
    """Download the quantized MPT-30B chat weights into ``destination_folder``.

    The file is requested with ``local_dir`` so that it ends up directly at
    ``<destination_folder>/mpt-30b-chat.ggmlv0.q4_1.bin``, which is the path
    the inference script loads. Passing the folder as ``cache_dir`` instead
    would bury the file inside the hub cache layout
    (``models--<org>--<repo>/snapshots/<revision>/...``).

    Args:
        destination_folder: Directory that should receive the weights file.

    Returns:
        The value returned by ``hf_hub_download`` (the local file path).
    """
    local_path = os.path.relpath(destination_folder)
    return hf_hub_download(
        repo_id="TheBloke/mpt-30B-chat-GGML",
        filename="mpt-30b-chat.ggmlv0.q4_1.bin",
        local_dir=local_path,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt-30b-chat.ggmlv0.q4_1.bin
    # (kept as a comment: a string literal here would not be a docstring,
    # only a discarded expression statement)
    destination_folder = "models"
    download_mpt_quant(destination_folder)
|
||||
68
inference.py
Normal file
68
inference.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
from ctransformers import AutoModelForCausalLM, AutoConfig
|
||||
|
||||
|
||||
def format_prompt(system_prompt, user_prompt):
    """Wrap the system and user text in ``<|im_start|>``/``<|im_end|>`` turns.

    Format based on:
    https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py

    Args:
        system_prompt: Text placed in the ``system`` turn.
        user_prompt: Text placed in the ``user`` turn.

    Returns:
        The system turn, the user turn and an opened ``assistant`` turn,
        concatenated into a single string.
    """
    system_turn = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    # The assistant turn is left unterminated so the model writes the reply.
    assistant_turn = "<|im_start|>assistant\n"

    return f"{system_turn}{user_turn}{assistant_turn}"
|
||||
|
||||
|
||||
def format_output(user_prompt):
    """Return the console header shown before the assistant's reply.

    The user's prompt is echoed on a ``[user]:`` line, followed by an
    ``[assistant]:`` label with no trailing space or newline.
    """
    header = "[user]: {}\n[assistant]:".format(user_prompt)
    return header
|
||||
|
||||
|
||||
def generate(
    llm,
    system_prompt,
    user_prompt,
    *,
    max_new_tokens=512,
    threads=24,
    stream=True,
):
    """Run model inference for one system/user prompt pair.

    Args:
        llm: Callable model; invoked as ``llm(prompt, **generation_kwargs)``.
        system_prompt: Text for the system turn of the prompt.
        user_prompt: Text for the user turn of the prompt.
        max_new_tokens: Upper bound on generated tokens; adjust as needed.
        threads: Number of CPU threads to use; adjust for your CPU.
        stream: Forwarded to ``llm``; when true the result is expected to be
            a generator yielding the output piece by piece.

    Returns:
        Whatever ``llm`` returns for the formatted prompt.
    """
    prompt = format_prompt(system_prompt, user_prompt)

    return llm(
        prompt,
        temperature=0.2,
        top_k=0,
        top_p=0.9,
        repetition_penalty=1.0,
        max_new_tokens=max_new_tokens,
        seed=42,
        reset=True,  # reset history (cache)
        stream=stream,  # streaming per word/token
        threads=threads,
        stop=["<|im_end|>", "|<"],
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Build the model configuration, then load the local quantized weights.
    config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
    llm = AutoModelForCausalLM.from_pretrained(
        "models/mpt-30b-chat.ggmlv0.q4_1.bin",
        model_type="mpt",
        config=config,
    )
    print(config)

    system_prompt = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."

    # Sample questions, each answered independently of the others.
    user_prompts = [
        "What is 2 + 2?",
        "What is 12 + 2?",
        "What is 5 + 7?",
        "What is 3 * 2?",
        "What is 4 / 2?",
        "Who was the first president of the US?",
        "Can humans ever set foot on mars?",
    ]

    separator = 80 * "="

    for question in user_prompts:
        # Start generation before printing the header, as in the original flow.
        token_stream = generate(llm, system_prompt, question)
        print(format_output(question), end=" ", flush=True)

        # Echo each streamed piece immediately so the reply appears live.
        for piece in token_stream:
            print(piece, end="", flush=True)

        # Terminate the streamed line, then draw a rule before the next prompt.
        print("")
        print(separator)
|
||||
BIN
media/inference-demo.mp4
Normal file
BIN
media/inference-demo.mp4
Normal file
Binary file not shown.
2
requirements.txt
Normal file
2
requirements.txt
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
ctransformers==0.2.10
|
||||
transformers==4.30.2
|
||||
Loading…
Reference in a new issue