prithivMLmods committed (verified)
Commit 5663d15 · 1 Parent(s): fe1e01e

Update app.py

Files changed (1): app.py (+80, -114)
app.py CHANGED
@@ -22,10 +22,15 @@ subprocess.run(
     shell=True
 )
 
+# Set torch backend configurations for Flux RealismLora
+torch.backends.cudnn.deterministic = True
+torch.backends.cudnn.benchmark = False
+torch.backends.cuda.matmul.allow_tf32 = True
+
 # -------------------------------
 # CONFIGURATION & UTILITY FUNCTIONS
 # -------------------------------
-MAX_SEED = np.iinfo(np.int32).max
+MAX_SEED = 2**32 - 1
 
 def save_image(img: Image.Image) -> str:
     """Save a PIL image with a unique filename and return its path."""
@@ -38,79 +43,66 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
         seed = random.randint(0, MAX_SEED)
     return seed
 
-# Determine preferred torch dtype based on GPU support.
-bf16_supported = torch.cuda.is_bf16_supported()
-preferred_dtype = torch.bfloat16 if bf16_supported else torch.float16
+def progress_bar_html(label: str) -> str:
+    """
+    Returns an HTML snippet for an animated progress bar with a given label.
+    """
+    return f'''
+    <div style="display: flex; align-items: center;">
+        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+        <div style="width: 110px; height: 5px; background-color: #FFC0CB; border-radius: 2px; overflow: hidden;">
+            <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
+        </div>
+    </div>
+    <style>
+    @keyframes loading {{
+        0% {{ transform: translateX(-100%); }}
+        100% {{ transform: translateX(100%); }}
+    }}
+    </style>
+    '''
 
 # -------------------------------
-# FLUX.1 IMAGE GENERATION SETUP
+# FLUX REALISMLORA IMAGE GENERATION SETUP (New Implementation)
 # -------------------------------
 from diffusers import DiffusionPipeline
 
 base_model = "black-forest-labs/FLUX.1-dev"
-pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=preferred_dtype)
-lora_repo = "strangerzonehf/Flux-Super-Realism-LoRA"
-trigger_word = "Super Realism"  # Leave blank if no trigger word is needed.
+pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
+lora_repo = "XLabs-AI/flux-RealismLora"
+trigger_word = ""  # No trigger word used.
 pipe.load_lora_weights(lora_repo)
 pipe.to("cuda")
 
-# Define style prompts for Flux.1
-style_list = [
-    {
-        "name": "3840 x 2160",
-        "prompt": "hyper-realistic 8K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-    },
-    {
-        "name": "2560 x 1440",
-        "prompt": "hyper-realistic 4K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-    },
-    {
-        "name": "HD+",
-        "prompt": "hyper-realistic 2K image of {prompt}. ultra-detailed, lifelike, high-resolution, sharp, vibrant colors, photorealistic",
-    },
-    {
-        "name": "Style Zero",
-        "prompt": "{prompt}",
-    },
-]
-styles = {s["name"]: s["prompt"] for s in style_list}
-DEFAULT_STYLE_NAME = "3840 x 2160"
-STYLE_NAMES = list(styles.keys())
-
-def apply_style(style_name: str, positive: str) -> str:
-    return styles.get(style_name, styles[DEFAULT_STYLE_NAME]).replace("{prompt}", positive)
-
-@spaces.GPU(duration=60, enable_queue=True)
-def generate_image_flux(
-    prompt: str,
-    seed: int = 0,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    randomize_seed: bool = False,
-    style_name: str = DEFAULT_STYLE_NAME,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """Generate an image using the Flux.1 pipeline with a chosen style."""
-    torch.cuda.empty_cache()  # Clear unused GPU memory to prevent allocation errors
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    positive_prompt = apply_style(style_name, prompt)
-    if trigger_word:
-        positive_prompt = f"{trigger_word} {positive_prompt}"
-    # Wrap the diffusion call in no_grad to avoid unnecessary gradient state.
-    with torch.no_grad():
-        images = pipe(
-            prompt=positive_prompt,
-            width=width,
-            height=height,
-            guidance_scale=guidance_scale,
-            num_inference_steps=28,
-            num_images_per_prompt=1,
-            output_type="pil",
-        ).images
-    torch.cuda.synchronize()  # Ensure all CUDA operations have completed
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
+@spaces.GPU()
+def run_lora(prompt, cfg_scale, steps, randomize_seed, seed, width, height, lora_scale, progress=gr.Progress(track_tqdm=True)):
+    # Set random seed for reproducibility
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    generator = torch.Generator(device="cuda").manual_seed(seed)
+
+    # Update progress bar (0% at start)
+    progress(0, "Starting image generation...")
+
+    # Simulate progress updates during the steps
+    for i in range(1, steps + 1):
+        if steps >= 10 and i % (steps // 10) == 0:
+            progress(i / steps * 100, f"Processing step {i} of {steps}...")
+
+    # Generate image using the pipeline
+    image = pipe(
+        prompt=f"{prompt} {trigger_word}",
+        num_inference_steps=steps,
+        guidance_scale=cfg_scale,
+        width=width,
+        height=height,
+        generator=generator,
+        joint_attention_kwargs={"scale": lora_scale},
+    ).images[0]
+
+    # Final progress update (100%)
+    progress(100, "Completed!")
+    yield image, seed
 
 # -------------------------------
 # SMOLVLM2 SETUP (Default Text/Multimodal Model)
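Reviewer note: to exercise the new LoRA path outside the Space, here is a standalone sketch using the same repos and the defaults this commit wires into the chat handler (assumes a CUDA GPU and access to the gated FLUX.1-dev weights; the prompt and output filename are illustrative):

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)
pipe.load_lora_weights("XLabs-AI/flux-RealismLora")
pipe.to("cuda")

# Same defaults as the "@image" branch below: 32 steps, cfg 3.2, 1152x896, LoRA scale 0.85.
generator = torch.Generator(device="cuda").manual_seed(3981632454)
image = pipe(
    prompt="A futuristic cityscape at dusk in hyper-realistic style",
    num_inference_steps=32,
    guidance_scale=3.2,
    width=1152,
    height=896,
    generator=generator,
    joint_attention_kwargs={"scale": 0.85},  # LoRA strength, as in run_lora
).images[0]
image.save("realism_lora_sample.png")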
@@ -121,31 +113,12 @@ smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Inst
 smol_model = AutoModelForImageTextToText.from_pretrained(
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
     _attn_implementation="flash_attention_2",
-    torch_dtype=preferred_dtype
+    torch_dtype=torch.float16
 ).to("cuda:0")
 
 # -------------------------------
-# UTILITY FUNCTIONS
+# TTS UTILITY FUNCTIONS
 # -------------------------------
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for an animated progress bar with a given label.
-    """
-    return f'''
-    <div style="display: flex; align-items: center;">
-        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-        <div style="width: 110px; height: 5px; background-color: #FFC0CB; border-radius: 2px; overflow: hidden;">
-            <div style="width: 100%; height: 100%; background-color: #FF69B4; animation: loading 1.5s linear infinite;"></div>
-        </div>
-    </div>
-    <style>
-    @keyframes loading {{
-        0% {{ transform: translateX(-100%); }}
-        100% {{ transform: translateX(100%); }}
-    }}
-    </style>
-    '''
-
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
@@ -161,36 +134,32 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
 # CHAT / MULTIMODAL GENERATION FUNCTION
 # -------------------------------
 @spaces.GPU
-def generate(
-    input_dict: dict,
-    chat_history: list[dict],
-    max_tokens: int = 200,
-):
+def generate(input_dict: dict, chat_history: list[dict], max_tokens: int = 200):
     """
-    Generates chatbot responses using SmolVLM2 by default—with support for multimodal inputs and TTS.
+    Generates chatbot responses using SmolVLM2 with support for multimodal inputs and TTS.
     Special commands:
-    - "@image": triggers image generation using the Flux.1 pipeline.
+    - "@image": triggers image generation using the RealismLora flux implementation.
     - "@tts1" or "@tts2": triggers text-to-speech after generation.
     """
-    torch.cuda.empty_cache()  # Clear unused GPU memory for consistency
+    torch.cuda.empty_cache()
     text = input_dict["text"]
     files = input_dict.get("files", [])
 
-    # If the query starts with "@image", use Flux.1 to generate an image.
+    # If the query starts with "@image", use RealismLora to generate an image.
     if text.strip().lower().startswith("@image"):
         prompt = text[len("@image"):].strip()
-        yield progress_bar_html("Hold Tight Generating Flux.1 Image")
-        image_paths, used_seed = generate_image_flux(
-            prompt=prompt,
-            seed=1,
-            width=1024,
-            height=1024,
-            guidance_scale=3,
-            randomize_seed=True,
-            style_name=DEFAULT_STYLE_NAME,
-            progress=gr.Progress(track_tqdm=True),
-        )
-        yield gr.Image(image_paths[0])
+        yield progress_bar_html("Hold Tight Generating Flux RealismLora Image")
+        # Default parameters for RealismLora generation
+        default_cfg_scale = 3.2
+        default_steps = 32
+        default_width = 1152
+        default_height = 896
+        default_seed = 3981632454
+        default_lora_scale = 0.85
+        # Call the new run_lora function and yield its final result
+        for result in run_lora(prompt, default_cfg_scale, default_steps, True, default_seed, default_width, default_height, default_lora_scale, progress=gr.Progress(track_tqdm=True)):
+            final_result = result
+        yield gr.Image(final_result[0])
         return
 
     # Handle TTS commands if present.
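Reviewer note: run_lora yields exactly one (image, seed) tuple, so the for-loop above simply keeps the last yield; calling next() on the generator is an equivalent, tighter pattern inside the "@image" branch (a sketch, not part of the commit). Also worth flagging for a follow-up: gr.Progress expects fractions in [0, 1], so the i / steps * 100 and progress(100, ...) calls in run_lora overshoot that range.

# Equivalent consumption of the single-yield generator:
image, used_seed = next(run_lora(
    prompt, default_cfg_scale, default_steps, True, default_seed,
    default_width, default_height, default_lora_scale,
))
yield gr.Image(image)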
@@ -203,7 +172,6 @@ def generate(
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
 
-    # Use SmolVLM2 for chat/multimodal text generation.
     yield "Processing with SmolVLM2"
 
     # Build conversation messages based on input and history.
@@ -272,7 +240,6 @@ def generate(
         yield "Please input a text query along with the image(s)."
         return
 
-    print("resulting_messages", resulting_messages)
    inputs = smol_processor.apply_chat_template(
        resulting_messages,
        add_generation_prompt=True,
@@ -280,9 +247,8 @@ def generate(
         return_dict=True,
         return_tensors="pt",
     )
-    # Explicitly cast pixel values to the preferred dtype to match model weights.
     if "pixel_values" in inputs:
-        inputs["pixel_values"] = inputs["pixel_values"].to(preferred_dtype)
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)
     inputs = inputs.to(smol_model.device)
 
     streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)
@@ -305,7 +271,7 @@ def generate(
 # -------------------------------
 # GRADIO CHAT INTERFACE
 # -------------------------------
-DESCRIPTION = "# Flux.1 Realism 🥖 + SmolVLM2 Chat"
+DESCRIPTION = "# Flux RealismLora + SmolVLM2 Chat"
 if not torch.cuda.is_available():
     DESCRIPTION += "\n<p>⚠️Running on CPU, this may not work as expected.</p>"
 
@@ -328,7 +294,7 @@ demo = gr.ChatInterface(
         gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens"),
     ],
     examples=[
-        [{"text": "@image A futuristic cityscape at dusk in hyper-realistic 8K"}],
+        [{"text": "@image A futuristic cityscape at dusk in hyper-realistic style"}],
         [{"text": "Describe this image.", "files": ["example_images/mosque.jpg"]}],
         [{"text": "What does this document say?", "files": ["example_images/document.jpg"]}],
         [{"text": "@tts1 Explain the weather patterns shown in this diagram.", "files": ["example_images/examples_weather_events.png"]}],
@@ -342,7 +308,7 @@ demo = gr.ChatInterface(
         label="Query Input",
         file_types=["image", ".mp4"],
         file_count="multiple",
-        placeholder="Type text and/or upload media. Use '@image' for Flux.1 image gen, '@tts1' or '@tts2' for TTS."
+        placeholder="Type text and/or upload media. Use '@image' for image gen, '@tts1' or '@tts2' for TTS."
     ),
     stop_btn="Stop Generation",
     multimodal=True,
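Reviewer note: the TTS branch is untouched by this commit; only the signature of text_to_speech is visible in a hunk header above. For context, a helper with that signature is typically an edge-tts wrapper along these lines (a sketch assuming the edge-tts package, not code from this repo):

import asyncio
import edge_tts

async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
    # Stream the synthesized speech to an mp3 file and return its path.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

# Voices wired to the chat commands: @tts1 -> JennyNeural, @tts2 -> GuyNeural.
asyncio.run(text_to_speech("Hello from SmolVLM2", "en-US-JennyNeural"))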
 