# --- Required Installs ---
# Ensure these are in your requirements.txt for Hugging Face Spaces:
# gradio
# transformers
# torch
# duckduckgo_search
# huggingface_hub
# accelerate
# bitsandbytes
# sentencepiece
# spaces  <--- Provided by the Spaces environment

import gradio as gr
import transformers
import torch
from transformers import pipeline, BitsAndBytesConfig
from duckduckgo_search import DDGS
import re
import time
from huggingface_hub import HfApi
from spaces import GPU  # Directly import GPU from spaces - Crucial for HF Spaces

# --- Constants and Configuration ---
MODEL_ID = "nvidia/Llama-3.1-Nemotron-8B-UltraLong-4M-Instruct"
MAX_GPU_MEMORY = "40GiB"  # A100 memory allocation


# --- Model Loading ---
@GPU(memory=40)  # ****** THIS DECORATOR IS ESSENTIAL FOR SPACES STARTUP ******
def load_model():
    """Load the LLM, optimized for an A100 GPU, using 4-bit quantization."""
    print(f"Attempting to load model: {MODEL_ID} with 4-bit quantization")
    try:
        # Configure 4-bit quantization
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",              # NF4 is often recommended
            bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for compute
            bnb_4bit_use_double_quant=True,         # Double quantization saves more memory
        )
        # device_map="auto" lets accelerate place layers; explicitly setting
        # max_memory is usually unnecessary in that case.
        pipe = pipeline(
            "text-generation",
            model=MODEL_ID,
            # Note: torch_dtype is sometimes ignored when quantization_config is used,
            # but specifying compute_dtype in BitsAndBytesConfig is key.
            # Keep torch_dtype=torch.bfloat16 here for consistency if needed elsewhere.
            torch_dtype=torch.bfloat16,
            device_map="auto",  # Let accelerate handle layer placement
            model_kwargs={
                "quantization_config": quantization_config,
                "use_cache": True,
                # "trust_remote_code": True,  # Add if the model requires it (check model card)
            },
        )
        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (using 4-bit quantization)")
        return pipe
    except Exception as e:
        print(f"FATAL Error loading model '{MODEL_ID}' (check memory/config): {e}")
        # Re-raise so the failure is visible in the Spaces logs
        raise


# --- Web Search ---
# (Keep search_person, create_synthetic_profile, extract_text_from_search_results as before)
def search_person(name, context=""):
    """Search for information about a person using DuckDuckGo."""
    print(f"Searching for: {name} with context: {context}")
    results = []
    search_terms = []
    if context:
        search_terms.append(f"{name} {context}")
        grade_match = re.search(r'(\d+)(?:st|nd|rd|th)?\s+grade', context.lower())
        if grade_match:
            grade = grade_match.group(1)
            search_terms.append(f"{name} student {grade} grade")
    search_terms.append(f"{name}")
    search_terms.append(f"{name} biography")
    search_terms.append(f"{name} interests")
    search_terms.append(f"{name} personality")
    search_terms = list(dict.fromkeys(search_terms))  # De-duplicate while preserving order
    print(f"Using search terms: {search_terms}")
    try:
        with DDGS() as ddgs:
            for term in search_terms:
                print(f"Searching DDG for: '{term}'")
                search_results = list(ddgs.text(term, max_results=2))
                results.extend(search_results)
                time.sleep(0.2)
    except Exception as e:
        error_msg = f"Error during DuckDuckGo search: {str(e)}"
        print(error_msg)
        return error_msg
    if not results:
        print(f"No search results found for {name}. Creating synthetic profile.")
        return create_synthetic_profile(name, context)
    print(f"Found {len(results)} potential search results.")
    return results


def create_synthetic_profile(name, context):
    """Create a synthetic profile when search returns no results."""
    profile = {
        "title": f"Synthetic Profile for {name}",
        "href": "",
        "body": f"{name} is a person described with the context: '{context}'. "
    }
    if "grade" in context.lower():
        grade_match = re.search(r'(\d+)(?:st|nd|rd|th)?\s+grade', context.lower())
        if grade_match:
            try:
                grade = int(grade_match.group(1))
                age = 5 + grade  # Rough age estimate from grade level
                profile["body"] += f"Based on being in {grade}th grade, {name} is likely around {age} years old. "
                profile["body"] += "Typical interests for this age might include friends, hobbies, school subjects, and developing independence. "
            except ValueError:
                profile["body"] += f"The grade mentioned ('{grade_match.group(1)}') could not be parsed to estimate age. "
    profile["body"] += "Since no public information was found, this profile is based solely on the provided context."
    return [profile]


def extract_text_from_search_results(search_results):
    """Extract relevant text from search results."""
    if isinstance(search_results, str):
        return f"Could not extract text due to search error: {search_results}"
    combined_text = ""
    seen_bodies = set()
    count = 0
    max_results_to_process = 5
    for result in search_results:
        if count >= max_results_to_process:
            break
        if isinstance(result, dict) and 'body' in result and result['body']:
            body = result['body'].strip()
            if body not in seen_bodies:
                combined_text += body + "\n\n"
                seen_bodies.add(body)
                count += 1
    if not combined_text:
        return "No relevant text found in search results."
    combined_text = re.sub(r'\s+', ' ', combined_text).strip()
    max_length = 2000
    return (combined_text[:max_length] + "...") if len(combined_text) > max_length else combined_text


# --- LLM Generation Functions ---
def parse_llm_output(full_output, input_prompt_list):
    """Attempt to parse only the newly generated text from the LLM output."""
    if isinstance(full_output, list) and len(full_output) > 0:
        if isinstance(full_output[0], dict) and "generated_text" in full_output[0]:
            generated_text = full_output[0]["generated_text"]
        else:
            return str(full_output)
    elif isinstance(full_output, str):
        generated_text = full_output
    else:
        return str(full_output)

    last_input_content = ""
    if isinstance(input_prompt_list, list) and input_prompt_list:
        # Find the last message with 'user' or 'system' role potentially?
        # Let's stick to finding the last message content for simplicity.
        last_input_content = input_prompt_list[-1].get("content", "")

    if last_input_content:
        last_occurrence_index = generated_text.rfind(last_input_content)
        if last_occurrence_index != -1:
            potential_response = generated_text[last_occurrence_index + len(last_input_content):].strip()
            if potential_response:
                # Basic cleanup
                potential_response = re.sub(r'^</?s?>', '', potential_response).strip()
                potential_response = re.sub(r'^(assistant|ASSISTANT|System|SYSTEM)[:\s]*', '', potential_response).strip()
                # Check that the response is not just whitespace after cleanup
                if potential_response:
                    return potential_response

    # Fallback, or the model correctly outputted only the response
    cleaned_text = generated_text
    if isinstance(input_prompt_list, list) and input_prompt_list:
        first_prompt_content = input_prompt_list[0].get("content", "")
        if first_prompt_content and cleaned_text.startswith(first_prompt_content):
            # Be careful not to strip if the response happens to start the same way;
            # rely on the end-stripping heuristic above instead.
            pass

    # General cleanup
    cleaned_text = re.sub(r'^</?s?>', '', cleaned_text).strip()
    cleaned_text = re.sub(r'^(assistant|ASSISTANT|System|SYSTEM)[:\s]*', '', cleaned_text).strip()

    # If cleaning produced an empty string, fall back to the original generation.
    if not cleaned_text and generated_text:
        print("Warning: Parsing resulted in empty string, returning original generation.")
        return generated_text

    # If the input prompt wasn't found, assume the model outputted only the response
    # (ideal case) or the whole conversation (fallback); the cleaning above helps the latter.
    if last_input_content and last_occurrence_index == -1:
        print("Warning: Could not find last input prompt in LLM output. Returning cleaned full output.")
    return cleaned_text


@GPU(memory=40)  # Decorator needed for Spaces resource allocation during calls
def generate_enhanced_persona(model, name, bio_text, context=""):
    """Use the LLM to enhance the persona profile."""
    print(f"Generating enhanced persona for {name}...")
    if model is None:
        raise ValueError("Model is not loaded.")
    enhancement_prompt = [
        {"role": "system", "content": """You are an expert AI character developer. Your task is to synthesize information into a detailed and coherent character profile. Focus on personality, potential interests, speaking style, and mannerisms based ONLY on the provided text. If the text indicates the character is a child, ensure the profile reflects age-appropriate traits. Output ONLY the enhanced character profile description. Do not include conversational introductions, explanations, apologies for limited info, or markdown formatting like headers (e.g., ### Personality). Start directly with the profile text."""},
        {"role": "user", "content": f"""Synthesize the following information about '{name}' into a character profile.
Context: {context}
Information Found:\n{bio_text}\n\nCreate the profile based *only* on the text above."""}
    ]
    try:
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            outputs = model(enhancement_prompt, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9)
        parsed_output = parse_llm_output(outputs, enhancement_prompt)
        print("Enhanced persona generated.")
        return parsed_output if parsed_output else f"Could not generate profile based on:\n{bio_text}"
    except Exception as e:
        error_msg = f"Error generating enhanced persona: {str(e)}"
        print(error_msg)
        return f"Error enhancing profile: {str(e)}\n\nUsing basic info:\n{bio_text}"


@GPU(memory=40)  # Decorator needed for Spaces resource allocation during calls
def generate_system_prompt_with_llm(model, name, enhanced_profile, context=""):
    """Generate an optimized system prompt for the persona."""
    print(f"Generating system prompt for {name}...")
    if model is None:
        raise ValueError("Model is not loaded.")
    fallback_prompt = f"""You are simulating the character '{name}'. Act and respond according to this profile:\n{enhanced_profile}\nAdditional context for the simulation: {context}\n---\nMaintain this persona consistently. Respond naturally based on the profile. Do not mention that you are an AI or a simulation. If asked about details not in the profile, you can be evasive or state you don't know/remember, consistent with the persona."""
    prompt = [
        {"role": "system", "content": """You are an expert AI prompt engineer specializing in character simulation. Your task is to create a concise and effective system prompt for an LLM that will simulate a character based on a provided profile.
The system prompt should instruct the LLM to embody the character, covering:
1. Core personality, attitude, and speaking style (based on the profile).
2. Key interests or knowledge areas (if mentioned in the profile).
3. How to handle questions outside its knowledge (e.g., be evasive, admit ignorance naturally).
4. Explicitly state it should *not* break character or mention being an AI.
5. Incorporate age-appropriateness if the profile suggests a specific age group.
Output ONLY the system prompt itself. Do not add any explanation or introductory text."""},
        {"role": "user", "content": f"""Create a system prompt for an AI to simulate the character '{name}'.
Context for simulation: {context}
Character Profile:\n{enhanced_profile}\n\nGenerate the system prompt based *only* on the profile and context provided."""}
    ]
    try:
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            outputs = model(prompt, max_new_tokens=300, do_sample=True, temperature=0.6)
        parsed_output = parse_llm_output(outputs, prompt)
        print("System prompt generated.")
        return parsed_output if parsed_output else fallback_prompt
    except Exception as e:
        error_msg = f"Error generating system prompt: {str(e)}"
        print(error_msg)
        return fallback_prompt


@GPU(memory=40)  # Decorator needed for Spaces resource allocation during calls
def generate_response(model, messages):
    """Generate a response using the LLM."""
    print("Generating response...")
    if model is None:
        raise ValueError("Model is not loaded.")
    if not messages:
        return "Error: No message history provided."
    try:
        with torch.amp.autocast('cuda', dtype=torch.bfloat16):
            # Ensure pad_token_id is set if needed, especially for batching or specific models
            outputs = model(
                messages,
                max_new_tokens=512,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                use_cache=True,
                # Check whether an EOS token is needed for this model/pipeline setup
                pad_token_id=model.tokenizer.eos_token_id if model.tokenizer.eos_token_id else None
            )
        parsed_output = parse_llm_output(outputs, messages)
        print("Response generated.")
        return parsed_output if parsed_output else "..."
    except Exception as e:
        error_msg = f"Error during response generation: {str(e)}"
        print(error_msg)
        # Keep the user-facing message generic; the specific error stays in the logs
        return "Sorry, I encountered an error trying to respond."


# --- Persona Chat Class ---
class PersonaChat:
    def __init__(self):
        self.model = None
        self.system_prompt = "You are a helpful assistant."
        self.persona_name = "Assistant"
        self.persona_context = ""
        self.messages = []
        self.enhanced_profile = ""
        self.model_loaded = False

    # No @GPU decorator needed here: this only calls functions that ARE decorated
    def load_model_if_needed(self):
        """Load the model if it hasn't been loaded successfully."""
        if not self.model_loaded or self.model is None:  # Check self.model too
            print("Model not loaded or instance lost. Attempting to load...")
            # Call the @GPU-decorated load_model function
            self.model = load_model()
            if self.model is None:  # load_model raises on failure, but double-check here
                raise RuntimeError("Failed to load the language model. Cannot proceed.")
            self.model_loaded = True
            print("Model loaded successfully within PersonaChat instance.")
        # else: print("Model already loaded.")  # Reduce log noise

    # No @GPU decorator needed here
    def set_persona(self, name, context=""):
        """Orchestrate persona creation: search, enhance, generate prompt."""
        # This generator calls functions that have @GPU decorators.
        try:
            self.load_model_if_needed()  # Ensures model is ready
            self.persona_name = name
            self.persona_context = context
            self.messages = []
            self.enhanced_profile = ""

            status = f"Searching for information about {name}..."
            yield status, "", "", [{"role": "system", "content": "Initializing persona creation..."}]

            search_results = search_person(name, context)
            if isinstance(search_results, str) and search_results.startswith("Error"):
                error_msg = f"Failed to set persona: {search_results}"
                yield error_msg, "", "", [{"role": "system", "content": error_msg}]
                return

            bio_text = extract_text_from_search_results(search_results)
            if bio_text.startswith("Could not extract text"):
                yield f"Warning: {bio_text}", "", "", [{"role": "system", "content": bio_text}]

            status = f"Creating enhanced profile for {name}..."
            yield status, "", bio_text, [{"role": "system", "content": status}]  # Show basic bio while enhancing

            # Call the @GPU-decorated function
            self.enhanced_profile = generate_enhanced_persona(self.model, name, bio_text, context)
            profile_for_prompt = self.enhanced_profile
            if self.enhanced_profile.startswith("Error enhancing profile"):
                yield "Warning: Could not enhance profile. Using basic info.", "", self.enhanced_profile, [{"role": "system", "content": self.enhanced_profile}]
                profile_for_prompt = bio_text  # Fallback

            status = f"Generating optimal system prompt for {name}..."
            # Yield the enhanced profile while generating the prompt
            yield status, self.enhanced_profile, self.enhanced_profile, [{"role": "system", "content": status}]

            # Call the @GPU-decorated function
            self.system_prompt = generate_system_prompt_with_llm(self.model, name, profile_for_prompt, context)
            self.messages = [{"role": "system", "content": self.system_prompt}]
            yield f"Persona set to '{name}'. Ready to chat!", self.system_prompt, self.enhanced_profile, self.messages
        except RuntimeError as e:
            error_msg = f"Critical Error: {str(e)}"
            print(error_msg)
            yield error_msg, "", "", [{"role": "system", "content": error_msg}]
        except Exception as e:
            error_msg = f"An unexpected error occurred during persona setup: {str(e)}"
            print(error_msg)
            # Attempt to yield the current state even on error
            yield error_msg, self.system_prompt, self.enhanced_profile, [{"role": "system", "content": error_msg}]

    # No @GPU decorator needed here
    def chat(self, user_message):
        """Process a user message and return the AI's response."""
        # This method calls generate_response, which has the @GPU decorator.
        try:
            self.load_model_if_needed()
            if not self.messages:
                print("Error: Chat called before persona was set.")
                return "Please set a persona first using the controls above."
            print(f"User message: {user_message}")
            formatted_message = {"role": "user", "content": user_message}
            # Keep internal history; the pipeline receives the full message list each turn
            self.messages.append(formatted_message)
            # Call the @GPU-decorated function
            response = generate_response(self.model, self.messages)
            # Append the assistant response only if generation succeeded
            if not response.startswith("Sorry, I encountered an error"):
                assistant_message = {"role": "assistant", "content": response}
                self.messages.append(assistant_message)
                print(f"Assistant response: {response}")
            else:
                print(f"Assistant error response: {response}")
                # Do not add the error message itself to the persistent history;
                # let the UI show the error, but don't make the bot repeat it next turn.
            return response
        except RuntimeError as e:
            error_msg = f"Critical Error: {str(e)}. Cannot generate response."
            print(error_msg)
            return error_msg
        except Exception as e:
            error_msg = f"Error generating response: {str(e)}"
            print(error_msg)
            return f"Sorry, I encountered an error: {str(e)}"


# --- Gradio Interface ---
def create_interface():
    persona_chat = PersonaChat()  # Instantiate the handler class

    css = """
    .gradio-container { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
    .main-container { max-width: 1200px; margin: auto; padding: 0; }
    .header { background: linear-gradient(90deg, #2c3e50, #4ca1af); color: white; padding: 20px; border-radius: 10px 10px 0 0; margin-bottom: 20px; text-align: center; }
    .setup-section { background-color: #f9f9f9; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 20px; }
    .chat-section { background-color: white; border-radius: 10px; padding: 20px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); }
    .status-bar { background: #e9ecef; padding: 10px 15px; border-radius: 5px; margin: 15px 0; font-weight: 500; border: 1px solid #ced4da; }
    .chat-container { border: 1px solid #eaeaea; border-radius: 10px; height: 500px !important; overflow-y: auto; background-color: #ffffff; padding: 10px; }
    .message-input { margin-top: 10px; }
    .send-button { background-color: #2c3e50 !important; color: white !important; }
    .persona-button { background-color: #4ca1af !important; color: white !important; }
    .system-prompt-display { background-color: #f5f5f5; border-radius: 8px; padding: 15px; margin-top: 15px; border: 1px solid #e0e0e0; font-family: monospace; white-space: pre-wrap; word-wrap: break-word; }
    .footer { text-align: center; margin-top: 20px; font-size: 0.9rem; color: #666; }
    /* Use default chatbot message styling provided by type='messages' */
    .typing-indicator { color: #aaa; font-style: italic; }
    """

    with gr.Blocks(css=css, title="AI Persona Simulator") as interface:
        with gr.Row(elem_classes="main-container"):
            with gr.Column():
                with gr.Column(elem_classes="header"):
                    gr.Markdown("# AI Persona Simulator")
                    gr.Markdown("Create and interact with AI-driven character simulations")
                with gr.Column(elem_classes="setup-section"):
                    gr.Markdown("### 1. Create Your Persona")
                    gr.Markdown("Enter a name and context. The AI will search, build a profile, and prepare for chat.")
                    with gr.Row():
                        name_input = gr.Textbox(label="Character Name", placeholder="e.g., Sherlock Holmes, Erenalp, A curious 7th grader", elem_id="name_input")
                        context_input = gr.Textbox(label="Character Context / Description", placeholder="e.g., Living in 221B Baker Street, London. OR 7th grade, loves math...", lines=2, elem_id="context_input")
                    set_persona_button = gr.Button("Create Persona & Start Chat", variant="primary", elem_classes="persona-button")
                    status_output = gr.Textbox(label="Status", value="Enter details above and click 'Create Persona'.", interactive=False, elem_classes="status-bar")
                    with gr.Accordion("View Generated Details", open=False):
                        enhanced_profile_display = gr.TextArea(label="Enhanced Profile (Generated by AI)", interactive=False, lines=10, elem_classes="system-prompt-display")
                        system_prompt_display = gr.TextArea(label="System Prompt (Instructions for the AI)", interactive=False, lines=10, elem_classes="system-prompt-display")
                with gr.Column(elem_classes="chat-section"):
                    gr.Markdown("### 2. Chat with Your Character")
                    character_name_display = gr.Markdown(value="*No persona created yet*", elem_id="character-name-display")
                    # ***** FIX GRADIO WARNINGS *****
                    chatbot = gr.Chatbot(
                        label="Conversation",
                        height=450,
                        elem_classes="chat-container",
                        # bubble_full_width=False,  # Deprecated
                        avatar_images=(None, "🤖"),  # User default, bot emoji
                        type="messages"  # ***** USE RECOMMENDED TYPE *****
                    )
                    with gr.Row():
                        msg_input = gr.Textbox(label="Your message", placeholder="Type your message here and press Enter...", elem_classes="message-input", scale=4)
                        send_button = gr.Button("Send", variant="primary", elem_classes="send-button", scale=1)
                with gr.Column(elem_classes="footer"):
                    gr.Markdown(f"Powered by {MODEL_ID}")

        # --- Event Handlers ---
        def set_persona_flow(name, context):
            if not name:
                yield "Status: Please enter a character name.", "", "", "*No persona created yet*", []
                return
            initial_status = f"Creating persona for '{name}'..."
            initial_character_display = f"### Preparing to chat with {name}..."
            initial_prompt = "System prompt will appear here..."
            initial_profile = "Enhanced profile will appear here..."
            # Start with empty history for the 'messages' type
            initial_history = []
            yield initial_status, initial_prompt, initial_profile, initial_character_display, initial_history

            final_status, final_prompt, final_profile = "Error", "", ""
            final_history = initial_history
            try:
                # Iterate the PersonaChat generator.
                # Expected yield order: status, system_prompt, enhanced_profile, messages_list
                for status_update, prompt_update, profile_update, history_update in persona_chat.set_persona(name, context):
                    final_status, final_prompt, final_profile = status_update, prompt_update, profile_update
                    if isinstance(history_update, list):
                        final_history = history_update
                    character_display = f"### Preparing chat with {name}..."
if "Ready to chat" in status_update: character_display = f"### Chatting with {name}" elif "Error" in status_update: character_display = f"### Error creating {name}" yield status_update, final_prompt, final_profile, character_display, final_history time.sleep(0.1) # Small delay for UI update visibility except Exception as e: error_msg = f"Failed to set persona (interface error): {str(e)}" print(error_msg) # Try to yield error state yield error_msg, final_prompt, final_profile, f"### Error creating {name}", final_history def send_message_flow(message, history): # Ensure history is a list (for messages type) if history is None: history = [] if not message.strip(): return "", history # Check if persona is ready (looks for system message in internal state) if not persona_chat.messages or persona_chat.messages[0]['role'] != 'system': history.append({"role": "user", "content": message}) history.append({"role": "assistant", "content": "Error: Please create a valid persona first."}) return "", history # Append user message to UI history history.append({"role": "user", "content": message}) # Append placeholder for bot response (typing indicator) history.append({"role": "assistant", "content": None}) # Use None for typing indicator with type='messages' yield "", history # Update UI to show user msg + typing # Call chat method (uses internal state, returns string response) response_text = persona_chat.chat(message) # Update the placeholder in UI history with the actual response history[-1]["content"] = response_text yield "", history # Update UI with final response set_persona_button.click( set_persona_flow, inputs=[name_input, context_input], outputs=[status_output, system_prompt_display, enhanced_profile_display, character_name_display, chatbot] ) send_button.click( send_message_flow, inputs=[msg_input, chatbot], outputs=[msg_input, chatbot] ) msg_input.submit( send_message_flow, inputs=[msg_input, chatbot], outputs=[msg_input, chatbot] ) return interface # --- Main Execution --- if __name__ == "__main__": print("Starting Gradio application for Hugging Face Spaces...") demo = create_interface() demo.queue().launch( # queue() is recommended for Spaces server_name="0.0.0.0", server_port=7860, # share=False is default and usually needed for Spaces deployment structure show_error=True, # Good for debugging in Spaces logs debug=True # More verbose logging )