Spaces:

hanzla
/

Falcon3MambaReasoner

Running on Zero

File size: 3,140 Bytes

import gradio as gr
import subprocess
import sys
import os
import spaces

# Install the necessary packages that require CUDA
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
except Exception as e:
    print(f"Warning: Could not install CUDA extensions: {e}")
    print("The model might not work correctly or will be slower.")

# Now import the required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Define model repository
repo_name = "hanzla/Falcon3-Mamba-R1-v0"

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_name)

# Load model with appropriate settings
print("Loading model... (this may take some time)")
model = None

try:
    # Try to load the model with GPU acceleration
    model = AutoModelForCausalLM.from_pretrained(
        repo_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )
except Exception as e:
    print(f"Error loading model with GPU: {e}")
    print("Attempting to load with CPU only...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            repo_name,
            device_map="cpu",
            torch_dtype=torch.float32,
        )
    except Exception as e2:
        print(f"Error loading model with CPU: {e2}")

if model is None:
    print("Could not load the model. Please check the logs.")
else:
    print("Model loaded successfully!")


@spaces.GPU
def generate_response(message, history):
    print(message)
    if model is None:
        return "Sorry, the model could not be loaded. Please check the logs."
    
    messages = [
        {"role": "system", "content": "You are a helpful assistant. You think before answering"},
    ]
    
    # Add chat history to messages
    for h in history:
        messages.append({"role": "user", "content": h[0]})
        messages.append({"role": "assistant", "content": h[1]})
    
    # Add current message
    messages.append({"role": "user", "content": message})
    
    # Generate input text using chat template
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    
    # Generate response
    outputs = model.generate(
        input_ids, 
        max_new_tokens=8000,
        temperature=0.7,
        do_sample=True,
    )
    
    # Decode the generated tokens
    generated_tokens = outputs[0][len(input_ids[0]):]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    return response

# Create Gradio interface
demo = gr.ChatInterface(
    generate_response,
    title="Falcon3-Mamba-R1-v0 Chat",
    description="Chat with the Falcon3-Mamba-R1-v0 model.",
    examples=[ 
              "How does the surface area of moon compare with that of earth?",
              "Why it takes 8 minutes for sunlight to reach earth?"],
    theme="soft"
)

# Launch the interface
demo.launch()