Spaces:

ariG23498
/

d-fine

Running

App Files Files Community

ariG23498 HF Staff commited on 4 days ago

Commit

5d67892

1 Parent(s): 65d0271

adding video logic

Browse files

Files changed (6) hide show

.gitattributes +1 -0
.gitignore +3 -0
README.md +1 -1
app.py +423 -126
requirements.txt +4 -1
video.mp4 +3 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.ruff_cache
+.venv
+static

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: D Fine
 emoji: 🌖
 colorFrom: red
 colorTo: indigo

 ---
+title: Real Time Object Detection wtih D-Fine
 emoji: 🌖
 colorFrom: red
 colorTo: indigo

app.py CHANGED Viewed

@@ -1,45 +1,103 @@
 import gradio as gr
 from transformers import pipeline
 from transformers.image_utils import load_image
-checkpoints = [
-    'ustc-community/dfine_n_coco',
-    'ustc-community/dfine_s_coco',
-    'ustc-community/dfine_m_coco',
-    'ustc-community/dfine_l_coco',
-    'ustc-community/dfine_x_coco',
-    'ustc-community/dfine_s_obj365',
-    'ustc-community/dfine_m_obj365',
-    'ustc-community/dfine_l_obj365',
-    'ustc-community/dfine_x_obj365',
-    'ustc-community/dfine_s_obj2coco',
-    'ustc-community/dfine_m_obj2coco',
-    'ustc-community/dfine_l_obj2coco_e25',
-    'ustc-community/dfine_x_obj2coco',
 ]
-def detect_objects(image, checkpoint, confidence_threshold=0.3, use_url=False, url=""):
-    pipe = pipeline(
-        "object-detection",
-        model=checkpoint,
-        image_processor=checkpoint,
-        device="cpu",
-    )
     if use_url and url:
-        input_image = load_image(url)
     elif image is not None:
         input_image = image
     else:
-        return None, gr.Markdown("**Error**: Please provide an image or URL.", visible=True)
-    # Run detection
-    results = pipe(input_image, threshold=confidence_threshold)
-    # Get image dimensions for validation
     img_width, img_height = input_image.size
-    # Prepare annotations in the format: list of (bounding_box, label)
     annotations = []
     for result in results:
         score = result["score"]
@@ -47,107 +105,315 @@ def detect_objects(image, checkpoint, confidence_threshold=0.3, use_url=False, u
             continue
         label = f"{result['label']} ({score:.2f})"
         box = result["box"]
-        # Validate and convert box to (x1, y1, x2, y2)
-        x1 = max(0, int(box["xmin"]))
-        y1 = max(0, int(box["ymin"]))
-        x2 = min(img_width, int(box["xmax"]))
-        y2 = min(img_height, int(box["ymax"]))
-        # Ensure valid box
-        if x2 <= x1 or y2 <= y1:
             continue
-        bounding_box = (x1, y1, x2, y2)
         annotations.append((bounding_box, label))
-    # Handle empty annotations
     if not annotations:
         return (input_image, []), gr.Markdown(
             "**Warning**: No objects detected above the confidence threshold. Try lowering the threshold.",
-            visible=True
         )
-    # Return base image and annotations
     return (input_image, annotations), gr.Markdown(visible=False)
 # Gradio interface
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # Real-Time Object Detection Demo
-        Experience state-of-the-art object detection with USTC's Dfine models. Upload an image, provide a URL, or try an example below. Select a model and adjust the confidence threshold to see detections in real time!
-        **Instructions**:
-        - Upload an image or enter a URL.
-        - Choose a model checkpoint from the dropdown.
-        - Adjust the confidence threshold (0.1 to 1.0).
-        - Click "Detect Objects" to view results, or select an example.
-        - Use "Clear" to reset inputs and outputs.
         """,
-        elem_classes="header-text"
     )
-    with gr.Row():
-        with gr.Column(scale=1, min_width=300):
-            with gr.Group():
-                image_input = gr.Image(
-                    label="Upload Image",
-                    type="pil",
-                    sources=["upload", "webcam"],
-                    interactive=True,
-                    elem_classes="input-component",
-                )
-                use_url = gr.Checkbox(label="Use Image URL Instead", value=False)
-                url_input = gr.Textbox(
-                    label="Image URL",
-                    placeholder="https://example.com/image.jpg",
-                    visible=False,
-                    elem_classes="input-component",
-                )
-                checkpoint = gr.Dropdown(
-                    choices=checkpoints,
-                    label="Select Model Checkpoint",
-                    value=checkpoints[0],
-                    elem_classes="input-component",
-                )
-                confidence_threshold = gr.Slider(
-                    minimum=0.1,
-                    maximum=1.0,
-                    value=0.3,
-                    step=0.1,
-                    label="Confidence Threshold",
-                    elem_classes="input-component",
-                )
-                with gr.Row():
-                    detect_button = gr.Button(
-                        "Detect Objects",
-                        variant="primary",
-                        elem_classes="action-button",
                     )
-                    clear_button = gr.Button(
-                        "Clear",
-                        variant="secondary",
-                        elem_classes="action-button",
                     )
-        with gr.Column(scale=2):
-            output_annotated = gr.AnnotatedImage(
-                label="Detection Results",
-                show_label=True,
-                color_map=None,  # Let Gradio assign colors
-                elem_classes="output-component",
             )
-            error_message = gr.Markdown(visible=False, elem_classes="error-text")
-    gr.Examples(
-        examples=[
-            ["./image.jpg", False, "", checkpoints[0], 0.3],
-            [None, True, "https://live.staticflickr.com/65535/33021460783_1646d43c54_b.jpg", checkpoints[0], 0.3],
-        ],
-        inputs=[image_input, use_url, url_input, checkpoint, confidence_threshold],
-        outputs=[output_annotated, error_message],
-        fn=detect_objects,
-        cache_examples=False,  # Avoid caching due to model size
-        label="Select an example to run the model",
-    )
     # Dynamic visibility for URL input
     use_url.change(
@@ -156,34 +422,65 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         outputs=url_input,
     )
-    # Clear button functionality
-    clear_button.click(
         fn=lambda: (
-            None,  # image_input
-            False,  # use_url
-            "",  # url_input
-            checkpoints[0],  # checkpoint
-            0.3,  # confidence_threshold
-            None,  # output_annotated
-            gr.Markdown(visible=False),  # error_message
         ),
         outputs=[
             image_input,
             use_url,
             url_input,
-            checkpoint,
-            confidence_threshold,
-            output_annotated,
-            error_message,
         ],
     )
-    # Detect button event
-    detect_button.click(
         fn=detect_objects,
-        inputs=[image_input, checkpoint, confidence_threshold, use_url, url_input],
-        outputs=[output_annotated, error_message],
     )
 if __name__ == "__main__":
-    demo.launch()

+import logging
+import os
+from typing import Tuple, List, Optional
+from pathlib import Path
+import shutil
+import tempfile
+import numpy as np
+import cv2
 import gradio as gr
+from PIL import Image
 from transformers import pipeline
 from transformers.image_utils import load_image
+import tqdm
+# Configuration constants
+CHECKPOINTS = [
+    "ustc-community/dfine_m_obj365",
+    "ustc-community/dfine_n_coco",
+    "ustc-community/dfine_s_coco",
+    "ustc-community/dfine_m_coco",
+    "ustc-community/dfine_l_coco",
+    "ustc-community/dfine_x_coco",
+    "ustc-community/dfine_s_obj365",
+    "ustc-community/dfine_l_obj365",
+    "ustc-community/dfine_x_obj365",
+    "ustc-community/dfine_s_obj2coco",
+    "ustc-community/dfine_m_obj2coco",
+    "ustc-community/dfine_l_obj2coco_e25",
+    "ustc-community/dfine_x_obj2coco",
 ]
+MAX_NUM_FRAMES = 300
+DEFAULT_CHECKPOINT = CHECKPOINTS[0]
+DEFAULT_CONFIDENCE_THRESHOLD = 0.3
+IMAGE_EXAMPLES = [
+    {"path": "./image.jpg", "use_url": False, "url": "", "label": "Local Image"},
+    {
+        "path": None,
+        "use_url": True,
+        "url": "https://live.staticflickr.com/65535/33021460783_1646d43c54_b.jpg",
+        "label": "Flickr Image",
+    },
+]
+VIDEO_EXAMPLES = [
+    {"path": "./video.mp4", "label": "Local Video"},
+]
+ALLOWED_VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov"}
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+VIDEO_OUTPUT_DIR = Path("static/videos")
+VIDEO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+def detect_objects(
+    image: Optional[Image.Image],
+    checkpoint: str,
+    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
+    use_url: bool = False,
+    url: str = "",
+) -> Tuple[
+    Optional[Tuple[Image.Image, List[Tuple[Tuple[int, int, int, int], str]]]],
+    gr.Markdown,
+]:
     if use_url and url:
+        try:
+            input_image = load_image(url)
+        except Exception as e:
+            logger.error(f"Failed to load image from URL {url}: {str(e)}")
+            return None, gr.Markdown(
+                f"**Error**: Failed to load image from URL: {str(e)}", visible=True
+            )
     elif image is not None:
+        if not isinstance(image, Image.Image):
+            logger.error("Input image is not a PIL Image")
+            return None, gr.Markdown("**Error**: Invalid image format.", visible=True)
         input_image = image
     else:
+        return None, gr.Markdown(
+            "**Error**: Please provide an image or URL.", visible=True
+        )
+    try:
+        pipe = pipeline(
+            "object-detection",
+            model=checkpoint,
+            image_processor=checkpoint,
+            device="cpu",
+        )
+    except Exception as e:
+        logger.error(f"Failed to initialize model pipeline for {checkpoint}: {str(e)}")
+        return None, gr.Markdown(
+            f"**Error**: Failed to load model: {str(e)}", visible=True
+        )
+    results = pipe(input_image, threshold=confidence_threshold)
     img_width, img_height = input_image.size
     annotations = []
     for result in results:
         score = result["score"]
             continue
         label = f"{result['label']} ({score:.2f})"
         box = result["box"]
+        # Validate and convert box to (xmin, ymin, xmax, ymax)
+        bbox_xmin = max(0, int(box["xmin"]))
+        bbox_ymin = max(0, int(box["ymin"]))
+        bbox_xmax = min(img_width, int(box["xmax"]))
+        bbox_ymax = min(img_height, int(box["ymax"]))
+        if bbox_xmax <= bbox_xmin or bbox_ymax <= bbox_ymin:
             continue
+        bounding_box = (bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax)
         annotations.append((bounding_box, label))
     if not annotations:
         return (input_image, []), gr.Markdown(
             "**Warning**: No objects detected above the confidence threshold. Try lowering the threshold.",
+            visible=True,
         )
     return (input_image, annotations), gr.Markdown(visible=False)
+def annotate_frame(
+    image: Image.Image, annotations: List[Tuple[Tuple[int, int, int, int], str]]
+) -> np.ndarray:
+    image_np = np.array(image)
+    image_bgr = image_np[:, :, ::-1].copy()  # RGB to BGR
+    for (xmin, ymin, xmax, ymax), label in annotations:
+        cv2.rectangle(image_bgr, (xmin, ymin), (xmax, ymax), (255, 255, 255), 2)
+        text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0]
+        cv2.rectangle(
+            image_bgr,
+            (xmin, ymin - text_size[1] - 4),
+            (xmin + text_size[0], ymin),
+            (255, 255, 255),
+            -1,
+        )
+        cv2.putText(
+            image_bgr,
+            label,
+            (xmin, ymin - 4),
+            cv2.FONT_HERSHEY_SIMPLEX,
+            0.5,
+            (0, 0, 0),
+            1,
+        )
+    return image_bgr
+def process_video(
+    video_path: str,
+    checkpoint: str,
+    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
+    progress: gr.Progress = gr.Progress(track_tqdm=True),
+) -> Tuple[Optional[str], gr.Markdown]:
+    if not video_path or not os.path.isfile(video_path):
+        logger.error(f"Invalid video path: {video_path}")
+        return None, gr.Markdown(
+            "**Error**: Please provide a valid video file.", visible=True
+        )
+    ext = os.path.splitext(video_path)[1].lower()
+    if ext not in ALLOWED_VIDEO_EXTENSIONS:
+        logger.error(f"Unsupported video format: {ext}")
+        return None, gr.Markdown(
+            f"**Error**: Unsupported video format. Use MP4, AVI, or MOV.", visible=True
+        )
+    try:
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            logger.error(f"Failed to open video: {video_path}")
+            return None, gr.Markdown(
+                "**Error**: Failed to open video file.", visible=True
+            )
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        # Use H.264 codec for browser compatibility
+        fourcc = cv2.VideoWriter_fourcc(*"H264")
+        temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
+        writer = cv2.VideoWriter(temp_file.name, fourcc, fps, (width, height))
+        if not writer.isOpened():
+            logger.error("Failed to initialize video writer")
+            cap.release()
+            temp_file.close()
+            os.unlink(temp_file.name)
+            return None, gr.Markdown(
+                "**Error**: Failed to initialize video writer.", visible=True
+            )
+        frame_count = 0
+        for _ in tqdm.tqdm(
+            range(min(MAX_NUM_FRAMES, num_frames)), desc="Processing video"
+        ):
+            ok, frame = cap.read()
+            if not ok:
+                break
+            rgb_frame = frame[:, :, ::-1]  # BGR to RGB
+            pil_image = Image.fromarray(rgb_frame)
+            (annotated_image, annotations), _ = detect_objects(
+                pil_image, checkpoint, confidence_threshold, use_url=False, url=""
+            )
+            if annotated_image is None:
+                continue
+            annotated_frame = annotate_frame(annotated_image, annotations)
+            writer.write(annotated_frame)
+            frame_count += 1
+        writer.release()
+        cap.release()
+        if frame_count == 0:
+            logger.warning("No valid frames processed in video")
+            temp_file.close()
+            os.unlink(temp_file.name)
+            return None, gr.Markdown(
+                "**Warning**: No valid frames processed. Try a different video or threshold.",
+                visible=True,
+            )
+        temp_file.close()
+        # Copy to persistent directory for Gradio access
+        output_filename = f"output_{os.path.basename(temp_file.name)}"
+        output_path = VIDEO_OUTPUT_DIR / output_filename
+        shutil.copy(temp_file.name, output_path)
+        os.unlink(temp_file.name)  # Remove temporary file
+        logger.info(f"Video saved to {output_path}")
+        return str(output_path), gr.Markdown(visible=False)
+    except Exception as e:
+        logger.error(f"Video processing failed: {str(e)}")
+        if "temp_file" in locals():
+            temp_file.close()
+            if os.path.exists(temp_file.name):
+                os.unlink(temp_file.name)
+        return None, gr.Markdown(
+            f"**Error**: Video processing failed: {str(e)}", visible=True
+        )
+def create_image_inputs() -> List[gr.components.Component]:
+    return [
+        gr.Image(
+            label="Upload Image",
+            type="pil",
+            sources=["upload", "webcam"],
+            interactive=True,
+            elem_classes="input-component",
+        ),
+        gr.Checkbox(label="Use Image URL Instead", value=False),
+        gr.Textbox(
+            label="Image URL",
+            placeholder="https://example.com/image.jpg",
+            visible=False,
+            elem_classes="input-component",
+        ),
+        gr.Dropdown(
+            choices=CHECKPOINTS,
+            label="Select Model Checkpoint",
+            value=DEFAULT_CHECKPOINT,
+            elem_classes="input-component",
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=DEFAULT_CONFIDENCE_THRESHOLD,
+            step=0.1,
+            label="Confidence Threshold",
+            elem_classes="input-component",
+        ),
+    ]
+def create_video_inputs() -> List[gr.components.Component]:
+    return [
+        gr.Video(
+            label="Upload Video",
+            sources=["upload"],
+            interactive=True,
+            format="mp4",  # Ensure MP4 format
+            elem_classes="input-component",
+        ),
+        gr.Dropdown(
+            choices=CHECKPOINTS,
+            label="Select Model Checkpoint",
+            value=DEFAULT_CHECKPOINT,
+            elem_classes="input-component",
+        ),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=DEFAULT_CONFIDENCE_THRESHOLD,
+            step=0.1,
+            label="Confidence Threshold",
+            elem_classes="input-component",
+        ),
+    ]
+def create_button_row(is_image: bool) -> List[gr.Button]:
+    prefix = "Image" if is_image else "Video"
+    return [
+        gr.Button(
+            f"{prefix} Detect Objects", variant="primary", elem_classes="action-button"
+        ),
+        gr.Button(f"{prefix} Clear", variant="secondary", elem_classes="action-button"),
+    ]
 # Gradio interface
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # Real-Time Object Detection Demo
+        Experience state-of-the-art object detection with USTC's Dfine models. Upload an image or video,
+        provide a URL, or try an example below. Select a model and adjust the confidence threshold to see detections in real time!
         """,
+        elem_classes="header-text",
     )
+    with gr.Tabs():
+        with gr.Tab("Image"):
+            with gr.Row():
+                with gr.Column(scale=1, min_width=300):
+                    with gr.Group():
+                        (
+                            image_input,
+                            use_url,
+                            url_input,
+                            image_checkpoint,
+                            image_confidence_threshold,
+                        ) = create_image_inputs()
+                        image_detect_button, image_clear_button = create_button_row(
+                            is_image=True
+                        )
+                with gr.Column(scale=2):
+                    image_output = gr.AnnotatedImage(
+                        label="Detection Results",
+                        show_label=True,
+                        color_map=None,
+                        elem_classes="output-component",
                     )
+                    image_error_message = gr.Markdown(
+                        visible=False, elem_classes="error-text"
                     )
+            gr.Examples(
+                examples=[
+                    [
+                        example["path"],
+                        example["use_url"],
+                        example["url"],
+                        DEFAULT_CHECKPOINT,
+                        DEFAULT_CONFIDENCE_THRESHOLD,
+                    ]
+                    for example in IMAGE_EXAMPLES
+                ],
+                inputs=[
+                    image_input,
+                    use_url,
+                    url_input,
+                    image_checkpoint,
+                    image_confidence_threshold,
+                ],
+                outputs=[image_output, image_error_message],
+                fn=detect_objects,
+                cache_examples=False,
+                label="Select an image example to populate inputs",
+            )
+        with gr.Tab("Video"):
+            gr.Markdown(
+                f"The input video will be truncated to {MAX_NUM_FRAMES} frames."
+            )
+            with gr.Row():
+                with gr.Column(scale=1, min_width=300):
+                    with gr.Group():
+                        video_input, video_checkpoint, video_confidence_threshold = (
+                            create_video_inputs()
+                        )
+                        video_detect_button, video_clear_button = create_button_row(
+                            is_image=False
+                        )
+                with gr.Column(scale=2):
+                    video_output = gr.Video(
+                        label="Detection Results",
+                        format="mp4",  # Explicit MP4 format
+                        elem_classes="output-component",
+                    )
+                    video_error_message = gr.Markdown(
+                        visible=False, elem_classes="error-text"
+                    )
+            gr.Examples(
+                examples=[
+                    [example["path"], DEFAULT_CHECKPOINT, DEFAULT_CONFIDENCE_THRESHOLD]
+                    for example in VIDEO_EXAMPLES
+                ],
+                inputs=[video_input, video_checkpoint, video_confidence_threshold],
+                outputs=[video_output, video_error_message],
+                fn=process_video,
+                cache_examples=False,
+                label="Select a video example to populate inputs",
             )
     # Dynamic visibility for URL input
     use_url.change(
         outputs=url_input,
     )
+    # Image clear button
+    image_clear_button.click(
         fn=lambda: (
+            None,
+            False,
+            "",
+            DEFAULT_CHECKPOINT,
+            DEFAULT_CONFIDENCE_THRESHOLD,
+            None,
+            gr.Markdown(visible=False),
         ),
         outputs=[
             image_input,
             use_url,
             url_input,
+            image_checkpoint,
+            image_confidence_threshold,
+            image_output,
+            image_error_message,
+        ],
+    )
+    # Video clear button
+    video_clear_button.click(
+        fn=lambda: (
+            None,
+            DEFAULT_CHECKPOINT,
+            DEFAULT_CONFIDENCE_THRESHOLD,
+            None,
+            gr.Markdown(visible=False),
+        ),
+        outputs=[
+            video_input,
+            video_checkpoint,
+            video_confidence_threshold,
+            video_output,
+            video_error_message,
         ],
     )
+    # Image detect button
+    image_detect_button.click(
         fn=detect_objects,
+        inputs=[
+            image_input,
+            image_checkpoint,
+            image_confidence_threshold,
+            use_url,
+            url_input,
+        ],
+        outputs=[image_output, image_error_message],
+    )
+    # Video detect button
+    video_detect_button.click(
+        fn=process_video,
+        inputs=[video_input, video_checkpoint, video_confidence_threshold],
+        outputs=[video_output, video_error_message],
     )
 if __name__ == "__main__":
+    demo.queue(max_size=20).launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,7 @@
 gradio
 transformers @ git+https://github.com/huggingface/transformers
 torch
-torchvision

 gradio
 transformers @ git+https://github.com/huggingface/transformers
 torch
+torchvision
+opencv-python
+tqdm
+pillow

video.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:747f9c2f9d19e4955603e1a13b69663187882d4c6a8fbcad18ddbd04ee792d4d
+size 1972564