CreitinGameplays committed (verified) · Commit c388a70 · 1 Parent(s): 4de3ee7

Update README.md

Files changed (1): README.md (+61 -16)
README.md CHANGED
@@ -23,30 +23,75 @@ Took **28 hours** to finetune on **2x Nvidia RTX A6000** with the following sett
 Run the model:
 ```python
 import torch
-from transformers import pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, BitsAndBytesConfig
+import bitsandbytes
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+    llm_int8_enable_fp32_cpu_offload=True
+)

 model_id = "CreitinGameplays/Llama-3.1-8B-R1-v0.1"

-pipe = pipeline(
-    "text-generation",
-    model=model_id,
+# Initialize model and tokenizer with streaming support
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
     torch_dtype=torch.bfloat16,
-    device_map="auto"
+    device_map="auto",
+    quantization_config=quantization_config
 )
+tokenizer = AutoTokenizer.from_pretrained(model_id)

-messages = [
-    {"role": "system", "content": "You are an AI assistant named Llama, made by Meta AI."},
-    {"role": "user", "content": "How many r's are in strawberry?"}
-]
+# Custom streamer that collects the output into a string while streaming
+class CollectingStreamer(TextStreamer):
+    def __init__(self, tokenizer):
+        super().__init__(tokenizer)
+        self.output = ""
+    def on_llm_new_token(self, token: str, **kwargs):
+        self.output += token
+        print(token, end="", flush=True)  # prints the token as it's generated

-outputs = pipe(
-    messages,
-    temperature=0.6,
-    repetition_penalty=1.1,
-    max_new_tokens=2048
-)
+print("Chat session started. Type 'exit' to quit.\n")
+
+# Initialize chat history as a list of messages
+chat_history = []
+chat_history.append({"role": "system", "content": "You are an AI assistant made by Meta AI."})
+
+while True:
+    user_input = input("You: ")
+    if user_input.strip().lower() == "exit":
+        break
+
+    # Append the user message to the chat history
+    chat_history.append({"role": "user", "content": user_input})
+
+    # Prepare the prompt by formatting the complete chat history
+    inputs = tokenizer.apply_chat_template(
+        chat_history,
+        return_tensors="pt"
+    ).to(model.device)
+
+    # Create a new streamer for the current generation
+    streamer = CollectingStreamer(tokenizer)
+
+    # Generate streamed response
+    model.generate(
+        inputs,
+        streamer=streamer,
+        temperature=0.6,
+        top_p=0.9,
+        top_k=50,
+        repetition_penalty=1.1,
+        max_new_tokens=6112,
+        do_sample=True
+    )
+
+    # The complete response text is stored in streamer.output
+    response_text = streamer.output
+    print("\nAssistant:", response_text)

-print(outputs[0]["generated_text"][-1])
+    # Append the assistant response to the chat history
+    chat_history.append({"role": "assistant", "content": response_text})
 ```

 ### Current Limitations
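
One caveat about the streaming helper in the updated example: recent `transformers` releases route decoded text through `TextStreamer.on_finalized_text(text, stream_end=False)` rather than an `on_llm_new_token` callback, so the override above may never be called and `streamer.output` can stay empty even though the base class still prints tokens. A minimal sketch of a collector built on that hook (assuming a current `transformers` version) might look like this:

```python
from transformers import TextStreamer

class CollectingStreamer(TextStreamer):
    """Prints chunks as they are decoded and keeps the full text in self.output."""

    def __init__(self, tokenizer, skip_prompt=True, **decode_kwargs):
        # skip_prompt=True keeps the echoed prompt out of the collected response
        super().__init__(tokenizer, skip_prompt=skip_prompt, **decode_kwargs)
        self.output = ""

    def on_finalized_text(self, text: str, stream_end: bool = False):
        # TextStreamer calls this hook with each decoded chunk of text
        self.output += text
        print(text, end="" if not stream_end else None, flush=True)
```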
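
Along the same lines, `apply_chat_template` accepts `add_generation_prompt=True`, which appends the assistant header so generation starts a new assistant turn instead of continuing the last user message; a sketch of the prompt-building step with that flag set:

```python
inputs = tokenizer.apply_chat_template(
    chat_history,
    add_generation_prompt=True,  # end the prompt with the assistant header
    return_tensors="pt"
).to(model.device)
```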
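
If you prefer not to depend on the streamer for the final text, the tensor returned by `model.generate` contains the prompt followed by the new tokens, so you can slice the prompt off and decode only the response; a small sketch under the same assumptions:

```python
output_ids = model.generate(
    inputs,
    streamer=streamer,  # optional: keep streaming to stdout while generating
    temperature=0.6,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    max_new_tokens=6112,
    do_sample=True
)
# Drop the prompt tokens and decode only what the model generated
response_text = tokenizer.decode(output_ids[0][inputs.shape[-1]:], skip_special_tokens=True)
```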