#!/usr/bin/env python3
import time
from vllm import LLM, SamplingParams

def main():
    # Hard-coded model and tensor parallel configuration.
    model_path = "miike-ai/qwen-14b-coder-fp8"
    tensor_parallel_size = 1

    # Define sampling parameters with an increased max_tokens and a stop string.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=0.95,
        max_tokens=32000,     # Increase this to allow longer responses.
        stop=["\nUser:"],     # Stop when the model outputs a new user marker.
    )

    print(f"Loading model '{model_path}' ...")
    model = LLM(
        model=model_path,
        enforce_eager=True,
        dtype="auto",
        tensor_parallel_size=tensor_parallel_size,
    )
    print("Model loaded. You can now chat!")
    print("Type 'exit' or 'quit' to end the conversation.\n")

    conversation = ""
    while True:
        try:
            user_input = input("User: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting chat.")
            break

        if user_input.lower() in {"exit", "quit"}:
            print("Exiting chat.")
            break

        # Append the user's input to the conversation history.
        conversation += f"User: {user_input}\nBot: "
        print("Bot: ", end="", flush=True)

        # Generate a response using the conversation history and sampling parameters.
        # use_tqdm=False keeps vLLM's progress bar from interleaving with the printed reply.
        response = model.generate(conversation, sampling_params=sampling_params, use_tqdm=False)
        # Extract the generated reply.
        bot_reply = response[0].outputs[0].text.strip()

        # Simulate streaming by printing one character at a time.
        for char in bot_reply:
            print(char, end="", flush=True)
            time.sleep(0.02)  # Adjust delay (in seconds) as desired.
        print()  # Newline after bot reply.

        # Append the bot reply to conversation history.
        conversation += bot_reply + "\n"

if __name__ == "__main__":
    main()
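
The script above builds its prompt by concatenating plain "User:" / "Bot:" strings. Qwen-family chat checkpoints are trained with a chat template, and recent vLLM versions expose LLM.chat(), which applies the template stored in the tokenizer config before generating. A minimal sketch of the same loop using that API, assuming a vLLM version that provides LLM.chat() and that this checkpoint ships a chat template:

#!/usr/bin/env python3
# Sketch: chat loop driven through vLLM's chat API instead of manual
# "User:/Bot:" concatenation. Assumes a recent vLLM with LLM.chat() and
# a checkpoint whose tokenizer config includes a chat template.
from vllm import LLM, SamplingParams

llm = LLM(model="miike-ai/qwen-14b-coder-fp8", enforce_eager=True, dtype="auto")
params = SamplingParams(temperature=0.0, max_tokens=2048)

messages = []  # running chat history as role/content dicts
while True:
    user_input = input("User: ").strip()
    if user_input.lower() in {"exit", "quit"}:
        break
    messages.append({"role": "user", "content": user_input})
    # LLM.chat() applies the model's chat template before generating.
    outputs = llm.chat(messages, sampling_params=params, use_tqdm=False)
    reply = outputs[0].outputs[0].text.strip()
    print(f"Bot: {reply}")
    messages.append({"role": "assistant", "content": reply})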

Safetensors
Model size: 14.8B params
Tensor types: BF16 · F8_E4M3

Model tree for miike-ai/qwen-14b-coder-fp8

Base model: Qwen/Qwen2.5-14B
This model is an FP8-quantized version of the base model.
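
Because this checkpoint is a quantized derivative of Qwen/Qwen2.5-14B, an alternative to downloading the pre-quantized FP8 weights is to load the BF16 base model and let vLLM quantize it at load time. A minimal sketch, assuming a recent vLLM build with FP8 support and a GPU that supports FP8 kernels (quantization="fp8" performs dynamic FP8 weight quantization when the model is loaded):

# Sketch: load the BF16 base model and quantize it to FP8 on the fly,
# instead of using the pre-quantized miike-ai/qwen-14b-coder-fp8 weights.
# Assumes a recent vLLM with FP8 support and compatible hardware.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-14B",   # base model from the model tree above
    quantization="fp8",         # dynamic FP8 quantization of the BF16 weights
    enforce_eager=True,
)
out = llm.generate(
    "def fibonacci(n):",
    SamplingParams(temperature=0.0, max_tokens=64),
    use_tqdm=False,
)
print(out[0].outputs[0].text)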