File size: 3,140 Bytes
b867d63
6aea303
 
 
322634a
6aea303
 
 
 
 
 
 
 
 
 
b867d63
 
 
 
 
 
6aea303
 
b867d63
 
6aea303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322634a
 
b867d63
0b266af
6aea303
dc060e7
6aea303
b867d63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc060e7
 
b867d63
f9a074f
b867d63
 
dc060e7
 
 
 
 
 
 
b867d63
dc060e7
b867d63
 
 
dc060e7
c53d7c3
 
 
b867d63
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
import subprocess
import sys
import os
import spaces

# Install the necessary packages that require CUDA
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
except Exception as e:
    print(f"Warning: Could not install CUDA extensions: {e}")
    print("The model might not work correctly or will be slower.")

# Now import the required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Define model repository
repo_name = "hanzla/Falcon3-Mamba-R1-v0"

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_name)

# Load model with appropriate settings
print("Loading model... (this may take some time)")
model = None

try:
    # Try to load the model with GPU acceleration
    model = AutoModelForCausalLM.from_pretrained(
        repo_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )
except Exception as e:
    print(f"Error loading model with GPU: {e}")
    print("Attempting to load with CPU only...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            repo_name,
            device_map="cpu",
            torch_dtype=torch.float32,
        )
    except Exception as e2:
        print(f"Error loading model with CPU: {e2}")

if model is None:
    print("Could not load the model. Please check the logs.")
else:
    print("Model loaded successfully!")


@spaces.GPU
def generate_response(message, history):
    print(message)
    if model is None:
        return "Sorry, the model could not be loaded. Please check the logs."
    
    messages = [
        {"role": "system", "content": "You are a helpful assistant. You think before answering"},
    ]
    
    # Add chat history to messages
    for h in history:
        messages.append({"role": "user", "content": h[0]})
        messages.append({"role": "assistant", "content": h[1]})
    
    # Add current message
    messages.append({"role": "user", "content": message})
    
    # Generate input text using chat template
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    
    # Generate response
    outputs = model.generate(
        input_ids, 
        max_new_tokens=8000,
        temperature=0.7,
        do_sample=True,
    )
    
    # Decode the generated tokens
    generated_tokens = outputs[0][len(input_ids[0]):]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    return response

# Create Gradio interface
demo = gr.ChatInterface(
    generate_response,
    title="Falcon3-Mamba-R1-v0 Chat",
    description="Chat with the Falcon3-Mamba-R1-v0 model.",
    examples=[ 
              "How does the surface area of moon compare with that of earth?",
              "Why it takes 8 minutes for sunlight to reach earth?"],
    theme="soft"
)

# Launch the interface
demo.launch()