Update README.md
README.md (CHANGED)
````diff
@@ -77,20 +77,21 @@ This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/
 
 ```python
 from vllm import LLM, SamplingParams
-from transformers import
+from transformers import AutoProcessor
 
 model_id = "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
 number_gpus = 1
 
 sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)
+processor = AutoProcessor.from_pretrained(model_id)
 
-
+messages = [{"role": "user", "content": "Give me a short introduction to large language model."}]
 
-
+prompts = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
 
 llm = LLM(model=model_id, tensor_parallel_size=number_gpus)
 
-outputs = llm.generate(
+outputs = llm.generate(prompts, sampling_params)
 
 generated_text = outputs[0].outputs[0].text
 print(generated_text)
````
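The change completes a previously truncated snippet: the old text broke off mid-import (`from transformers import`) and mid-call (`outputs = llm.generate(`), and the update wires the model's chat template through `AutoProcessor` so the instruct model receives properly formatted prompts. For reference, here is the updated example assembled as one script, a sketch of the post-change README content; it assumes `vllm` and `transformers` are installed and a GPU with enough memory for the 24B FP8 checkpoint:

```python
from vllm import LLM, SamplingParams
from transformers import AutoProcessor

model_id = "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"
number_gpus = 1

sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

# Format the request with the model's own chat template rather than raw text,
# so the instruct model sees the special tokens it was trained with.
processor = AutoProcessor.from_pretrained(model_id)
messages = [{"role": "user", "content": "Give me a short introduction to large language model."}]
prompts = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

llm = LLM(model=model_id, tensor_parallel_size=number_gpus)

# generate() accepts a prompt string (or a list of them) plus sampling settings.
outputs = llm.generate(prompts, sampling_params)

generated_text = outputs[0].outputs[0].text
print(generated_text)
```

With `tokenize=False`, `apply_chat_template` returns the formatted prompt as a plain string, which `llm.generate` accepts directly; vLLM then tokenizes it internally.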