Upload 7 files
Browse files- .gitattributes +2 -0
- README.md +231 -14
- app.py +1240 -0
- detector.py +465 -0
- phi4_detector.py +500 -0
- requirements.txt +12 -0
- vid-1.mp4 +3 -0
- vid-2.mp4 +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
vid-1.mp4 filter=lfs diff=lfs merge=lfs -text
|
37 |
+
vid-2.mp4 filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,14 +1,231 @@
|
|
1 |
-
---
|
2 |
-
title: Video
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license:
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Video Anomaly Detector
|
3 |
+
emoji: 🎥
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: green
|
6 |
+
sdk: streamlit
|
7 |
+
sdk_version: 1.31.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
# Video Anomaly Detector
|
14 |
+
|
15 |
+
This application analyzes video files frame by frame using advanced AI models to detect anomalies based on a user-provided prompt.
|
16 |
+
|
17 |
+
## Model Description
|
18 |
+
|
19 |
+
The application supports multiple AI models for analysis:
|
20 |
+
|
21 |
+
- **GPT-4o**: OpenAI's most powerful multimodal model, offering the highest accuracy for anomaly detection
|
22 |
+
- **GPT-4o-mini**: A smaller, faster, and more cost-effective version of GPT-4o
|
23 |
+
- **Phi-4**: Microsoft's multimodal model that can run locally using Hugging Face transformers
|
24 |
+
- **Phi-3**: *(Coming soon)* Microsoft's earlier multimodal model
|
25 |
+
|
26 |
+
Each model can analyze both text and images, examining video frames to identify potential anomalies based on the user's prompt.
|
27 |
+
|
28 |
+
## Demo App
|
29 |
+
|
30 |
+
[](https://huggingface.co/spaces/username/video-anomaly-detector)
|
31 |
+
|
32 |
+
## Features
|
33 |
+
|
34 |
+
- Support for both video files and live streams (webcam, IP camera, RTSP)
|
35 |
+
- Select from multiple AI models (GPT-4o, GPT-4o-mini, Phi-4)
|
36 |
+
- Skip frames for faster processing
|
37 |
+
- Provide custom prompts for anomaly detection
|
38 |
+
- Two analysis modes: frame-by-frame or cumulative summary
|
39 |
+
- Batch processing for multiple videos
|
40 |
+
- Streamlit web interface with modern UI design
|
41 |
+
|
42 |
+
## How It Works
|
43 |
+
|
44 |
+
1. The application extracts frames from the uploaded video or live stream
|
45 |
+
2. It skips a user-defined number of frames to reduce processing time
|
46 |
+
3. Based on the selected analysis depth:
|
47 |
+
- **Granular mode**: Each selected frame is analyzed individually
|
48 |
+
- **Cumulative mode**: All frames are analyzed together to provide an overall summary
|
49 |
+
4. The selected AI model analyzes the frame(s) and provides descriptions of any detected anomalies
|
50 |
+
5. Results are displayed in an interactive interface with timestamps for live streams
|
51 |
+
|
52 |
+
## Requirements
|
53 |
+
|
54 |
+
- Python 3.8+
|
55 |
+
- OpenAI API key with access to GPT-4o and GPT-4o-mini models (only needed for OpenAI models)
|
56 |
+
- For Phi-4: GPU recommended but not required (will use CPU if GPU not available)
|
57 |
+
- For live streams: Webcam or access to an IP camera/RTSP stream
|
58 |
+
|
59 |
+
## Installation
|
60 |
+
|
61 |
+
```bash
|
62 |
+
git clone https://github.com/username/video-anomaly-detector.git
|
63 |
+
cd video-anomaly-detector
|
64 |
+
pip install -r requirements.txt
|
65 |
+
```
|
66 |
+
|
67 |
+
## Environment Variables
|
68 |
+
|
69 |
+
Create a `.env` file in the root directory with your OpenAI API key (only needed for OpenAI models):
|
70 |
+
|
71 |
+
```
|
72 |
+
OPENAI_API_KEY=your_openai_api_key_here
|
73 |
+
```
|
74 |
+
|
75 |
+
## Usage
|
76 |
+
|
77 |
+
### Web Application
|
78 |
+
|
79 |
+
Run the Streamlit application:
|
80 |
+
|
81 |
+
```bash
|
82 |
+
streamlit run app.py
|
83 |
+
```
|
84 |
+
|
85 |
+
Your browser will automatically open with the application running at http://localhost:8501
|
86 |
+
|
87 |
+
#### Using with Video Files
|
88 |
+
|
89 |
+
1. Select "Video File" as the input source
|
90 |
+
2. Upload a video file
|
91 |
+
3. Configure the analysis settings
|
92 |
+
4. Click "Analyze Video"
|
93 |
+
|
94 |
+
#### Using with Live Streams
|
95 |
+
|
96 |
+
1. Select "Live Stream" as the input source
|
97 |
+
2. Choose between "Webcam" or "IP Camera / RTSP Stream"
|
98 |
+
3. For IP cameras, enter the stream URL (e.g., rtsp://username:password@ip_address:port/path)
|
99 |
+
4. Set the maximum number of frames to process
|
100 |
+
5. Configure the analysis settings
|
101 |
+
6. Click "Analyze Video"
|
102 |
+
|
103 |
+
### Command Line
|
104 |
+
|
105 |
+
#### Single Video Processing
|
106 |
+
|
107 |
+
```bash
|
108 |
+
python example.py --video path/to/video.mp4 --skip 5 --analysis_depth granular --model gpt-4o --prompt "Detect any unusual activities or objects in this frame"
|
109 |
+
```
|
110 |
+
|
111 |
+
Arguments:
|
112 |
+
- `--video`: Path to the video file (required)
|
113 |
+
- `--skip`: Number of frames to skip (default: 5)
|
114 |
+
- `--analysis_depth`: Analysis depth: 'granular' or 'cumulative' (default: 'granular')
|
115 |
+
- `--model`: AI model to use: 'gpt-4o', 'gpt-4o-mini', or 'phi-4' (default: 'gpt-4o')
|
116 |
+
- `--prompt`: Prompt for anomaly detection
|
117 |
+
- `--api_key`: OpenAI API key (optional if set in .env file, not needed for Phi-4)
|
118 |
+
|
119 |
+
#### Live Stream Processing
|
120 |
+
|
121 |
+
```bash
|
122 |
+
python example.py --stream 0 --skip 5 --analysis_depth granular --model gpt-4o --max_frames 30 --prompt "Detect any unusual activities or objects in this frame"
|
123 |
+
```
|
124 |
+
|
125 |
+
Arguments:
|
126 |
+
- `--stream`: Stream source (0 for webcam, URL for IP camera/RTSP stream)
|
127 |
+
- `--max_frames`: Maximum number of frames to process (default: 30)
|
128 |
+
- Other arguments are the same as for video processing
|
129 |
+
|
130 |
+
#### Batch Processing
|
131 |
+
|
132 |
+
```bash
|
133 |
+
python batch_process.py --videos_dir path/to/videos --output_dir output --skip 5 --analysis_depth cumulative --model gpt-4o-mini
|
134 |
+
```
|
135 |
+
|
136 |
+
Arguments:
|
137 |
+
- `--videos_dir`: Directory containing video files (required)
|
138 |
+
- `--output_dir`: Directory to save results (default: 'output')
|
139 |
+
- `--skip`: Number of frames to skip (default: 5)
|
140 |
+
- `--analysis_depth`: Analysis depth: 'granular' or 'cumulative' (default: 'granular')
|
141 |
+
- `--model`: AI model to use: 'gpt-4o', 'gpt-4o-mini', or 'phi-4' (default: 'gpt-4o')
|
142 |
+
- `--prompt`: Prompt for anomaly detection
|
143 |
+
- `--api_key`: OpenAI API key (optional if set in .env file, not needed for Phi-4)
|
144 |
+
- `--extensions`: Comma-separated list of video file extensions (default: '.mp4,.avi,.mov,.mkv')
|
145 |
+
|
146 |
+
## Model Options
|
147 |
+
|
148 |
+
### GPT-4o
|
149 |
+
- OpenAI's most powerful multimodal model
|
150 |
+
- Highest accuracy for anomaly detection
|
151 |
+
- Requires OpenAI API key
|
152 |
+
- Recommended for critical applications where accuracy is paramount
|
153 |
+
|
154 |
+
### GPT-4o-mini
|
155 |
+
- Smaller, faster version of GPT-4o
|
156 |
+
- More cost-effective for processing large videos
|
157 |
+
- Requires OpenAI API key
|
158 |
+
- Good balance between performance and cost
|
159 |
+
|
160 |
+
### Phi-4
|
161 |
+
- Microsoft's multimodal model
|
162 |
+
- Runs locally using Hugging Face transformers
|
163 |
+
- No API key required
|
164 |
+
- First run will download the model (approximately 5GB)
|
165 |
+
- GPU recommended but not required
|
166 |
+
|
167 |
+
### Phi-3 (Coming Soon)
|
168 |
+
- Microsoft's earlier multimodal model
|
169 |
+
- Will provide an alternative option for analysis
|
170 |
+
|
171 |
+
## Analysis Depth Options
|
172 |
+
|
173 |
+
### Granular - Frame by Frame
|
174 |
+
- Analyzes each frame individually
|
175 |
+
- Provides detailed analysis for every processed frame
|
176 |
+
- Useful for detecting specific moments or events
|
177 |
+
|
178 |
+
### Cumulative - All Frames
|
179 |
+
- Analyzes all frames together to provide an overall summary
|
180 |
+
- Identifies up to 3 key frames that best represent detected anomalies
|
181 |
+
- Useful for getting a high-level understanding of anomalies in the video
|
182 |
+
|
183 |
+
## Deploying to Hugging Face Spaces
|
184 |
+
|
185 |
+
This project is configured for easy deployment to Hugging Face Spaces:
|
186 |
+
|
187 |
+
1. Fork this repository to your GitHub account
|
188 |
+
2. Create a new Space on Hugging Face: https://huggingface.co/spaces/create
|
189 |
+
3. Select "Streamlit" as the SDK
|
190 |
+
4. Link your GitHub repository
|
191 |
+
5. Add your OpenAI API key as a secret in the Space settings (if using OpenAI models)
|
192 |
+
6. The Space will automatically deploy with the configuration from this repository
|
193 |
+
|
194 |
+
Alternatively, you can use the GitHub Actions workflow to automatically sync your repository to Hugging Face Spaces:
|
195 |
+
|
196 |
+
1. Create a Hugging Face account and generate an access token
|
197 |
+
2. Add the following secrets to your GitHub repository:
|
198 |
+
- `HF_TOKEN`: Your Hugging Face access token
|
199 |
+
- `HF_USERNAME`: Your Hugging Face username
|
200 |
+
- `OPENAI_API_KEY`: Your OpenAI API key (if using OpenAI models)
|
201 |
+
3. Push to the main branch to trigger the workflow
|
202 |
+
|
203 |
+
## Project Structure
|
204 |
+
|
205 |
+
```
|
206 |
+
video-anomaly-detector/
|
207 |
+
├── app.py # Streamlit web application
|
208 |
+
├── detector.py # Core video processing and anomaly detection with OpenAI models
|
209 |
+
├── phi4_detector.py # Phi-4 model implementation using Hugging Face
|
210 |
+
├── example.py # Example script for processing a single video
|
211 |
+
├── batch_process.py # Script for batch processing multiple videos
|
212 |
+
├── requirements.txt # Python dependencies
|
213 |
+
├── requirements-hf.txt # Dependencies for Hugging Face Spaces
|
214 |
+
├── .env.example # Template for environment variables
|
215 |
+
└── .github/ # GitHub Actions workflows
|
216 |
+
└── workflows/
|
217 |
+
└── sync-to-hub.yml # Workflow to sync to Hugging Face
|
218 |
+
```
|
219 |
+
|
220 |
+
## Limitations
|
221 |
+
|
222 |
+
- Processing time depends on the video length, frame skip rate, and your internet connection
|
223 |
+
- The OpenAI models require an API key and may incur usage costs
|
224 |
+
- Phi-4 model requires downloading approximately 5GB of model files on first use
|
225 |
+
- Higher frame skip values will process fewer frames, making analysis faster but potentially less accurate
|
226 |
+
- Cumulative analysis may miss some details that would be caught in granular analysis
|
227 |
+
- Live stream processing may be affected by network latency and camera quality
|
228 |
+
|
229 |
+
## License
|
230 |
+
|
231 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
app.py
ADDED
@@ -0,0 +1,1240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
---
|
3 |
+
title: Video Anomaly Detector
|
4 |
+
emoji: 🎥
|
5 |
+
colorFrom: blue
|
6 |
+
colorTo: green
|
7 |
+
sdk: streamlit
|
8 |
+
sdk_version: 1.31.0
|
9 |
+
app_file: app.py
|
10 |
+
pinned: false
|
11 |
+
license: mit
|
12 |
+
---
|
13 |
+
"""
|
14 |
+
import streamlit as st
|
15 |
+
import os
|
16 |
+
import tempfile
|
17 |
+
import time
|
18 |
+
from detector import VideoAnomalyDetector
|
19 |
+
import cv2
|
20 |
+
from PIL import Image
|
21 |
+
import numpy as np
|
22 |
+
from dotenv import load_dotenv
|
23 |
+
import streamlit.components.v1 as components
|
24 |
+
import json
|
25 |
+
import base64
|
26 |
+
from io import BytesIO
|
27 |
+
import smtplib
|
28 |
+
from email.mime.text import MIMEText
|
29 |
+
from email.mime.multipart import MIMEMultipart
|
30 |
+
from email.mime.image import MIMEImage
|
31 |
+
import requests
|
32 |
+
import re
|
33 |
+
|
34 |
+
# Custom JSON encoder to handle numpy arrays and other non-serializable types
|
35 |
+
class NumpyEncoder(json.JSONEncoder):
|
36 |
+
def default(self, obj):
|
37 |
+
if isinstance(obj, np.ndarray):
|
38 |
+
# Convert numpy arrays to base64 encoded strings
|
39 |
+
pil_img = Image.fromarray(obj)
|
40 |
+
buffered = BytesIO()
|
41 |
+
pil_img.save(buffered, format="PNG")
|
42 |
+
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
43 |
+
return {"__ndarray__": img_str}
|
44 |
+
return super(NumpyEncoder, self).default(obj)
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
def send_email_notification(to_email, subject, body, image=None):
|
50 |
+
"""Send email notification with optional image attachment"""
|
51 |
+
try:
|
52 |
+
# Get email credentials from environment variables
|
53 |
+
smtp_server = os.getenv("SMTP_SERVER", "smtp.gmail.com")
|
54 |
+
smtp_port = int(os.getenv("SMTP_PORT", "587"))
|
55 |
+
smtp_username = os.getenv("SMTP_USERNAME")
|
56 |
+
smtp_password = os.getenv("SMTP_PASSWORD")
|
57 |
+
|
58 |
+
if not smtp_username or not smtp_password:
|
59 |
+
st.warning("Email notification failed: SMTP credentials not configured. Please set SMTP_USERNAME and SMTP_PASSWORD environment variables.")
|
60 |
+
return False
|
61 |
+
|
62 |
+
# Create message
|
63 |
+
msg = MIMEMultipart()
|
64 |
+
msg['From'] = smtp_username
|
65 |
+
msg['To'] = to_email
|
66 |
+
msg['Subject'] = subject
|
67 |
+
|
68 |
+
# Attach text
|
69 |
+
msg.attach(MIMEText(body, 'plain'))
|
70 |
+
|
71 |
+
# Attach image if provided
|
72 |
+
if image is not None:
|
73 |
+
# Convert numpy array to image
|
74 |
+
if isinstance(image, np.ndarray):
|
75 |
+
pil_img = Image.fromarray(image)
|
76 |
+
img_byte_arr = BytesIO()
|
77 |
+
pil_img.save(img_byte_arr, format='PNG')
|
78 |
+
img_data = img_byte_arr.getvalue()
|
79 |
+
else:
|
80 |
+
# Assume it's already bytes
|
81 |
+
img_data = image
|
82 |
+
|
83 |
+
img_attachment = MIMEImage(img_data)
|
84 |
+
img_attachment.add_header('Content-Disposition', 'attachment', filename='anomaly.png')
|
85 |
+
msg.attach(img_attachment)
|
86 |
+
|
87 |
+
# Connect to server and send
|
88 |
+
server = smtplib.SMTP(smtp_server, smtp_port)
|
89 |
+
server.starttls()
|
90 |
+
server.login(smtp_username, smtp_password)
|
91 |
+
server.send_message(msg)
|
92 |
+
server.quit()
|
93 |
+
|
94 |
+
return True
|
95 |
+
except Exception as e:
|
96 |
+
st.warning(f"Email notification failed: {str(e)}")
|
97 |
+
return False
|
98 |
+
|
99 |
+
def send_whatsapp_notification(to_number, message):
|
100 |
+
"""Send WhatsApp notification using WhatsApp Business API"""
|
101 |
+
try:
|
102 |
+
# Get WhatsApp API credentials from environment variables
|
103 |
+
whatsapp_api_key = os.getenv("WHATSAPP_API_KEY")
|
104 |
+
whatsapp_phone_id = os.getenv("WHATSAPP_PHONE_ID")
|
105 |
+
|
106 |
+
if not whatsapp_api_key or not whatsapp_phone_id:
|
107 |
+
st.warning("WhatsApp notification failed: API credentials not configured. Please set WHATSAPP_API_KEY and WHATSAPP_PHONE_ID environment variables.")
|
108 |
+
return False
|
109 |
+
|
110 |
+
# For demonstration purposes, we'll show how to use the WhatsApp Business API
|
111 |
+
# In a real implementation, you would need to set up a WhatsApp Business account
|
112 |
+
# and use their official API
|
113 |
+
|
114 |
+
# Example using WhatsApp Business API
|
115 |
+
url = f"https://graph.facebook.com/v17.0/{whatsapp_phone_id}/messages"
|
116 |
+
headers = {
|
117 |
+
"Authorization": f"Bearer {whatsapp_api_key}",
|
118 |
+
"Content-Type": "application/json"
|
119 |
+
}
|
120 |
+
data = {
|
121 |
+
"messaging_product": "whatsapp",
|
122 |
+
"to": to_number,
|
123 |
+
"type": "text",
|
124 |
+
"text": {
|
125 |
+
"body": message
|
126 |
+
}
|
127 |
+
}
|
128 |
+
|
129 |
+
# For demonstration, we'll just log the request instead of actually sending it
|
130 |
+
print(f"Would send WhatsApp message to {to_number}: {message}")
|
131 |
+
|
132 |
+
# In a real implementation, you would uncomment this:
|
133 |
+
# response = requests.post(url, headers=headers, json=data)
|
134 |
+
# return response.status_code == 200
|
135 |
+
|
136 |
+
return True
|
137 |
+
except Exception as e:
|
138 |
+
st.warning(f"WhatsApp notification failed: {str(e)}")
|
139 |
+
return False
|
140 |
+
|
141 |
+
# Helper functions for notifications
|
142 |
+
def validate_email(email):
|
143 |
+
"""Validate email format"""
|
144 |
+
pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
|
145 |
+
return re.match(pattern, email) is not None
|
146 |
+
|
147 |
+
def validate_phone(phone):
|
148 |
+
"""Validate phone number format (should include country code)"""
|
149 |
+
pattern = r'^\+\d{1,3}\d{6,14}$'
|
150 |
+
return re.match(pattern, phone) is not None
|
151 |
+
|
152 |
+
def send_notification(notification_type, contact, message, image=None):
|
153 |
+
"""Send notification based on type"""
|
154 |
+
if notification_type == "email":
|
155 |
+
if validate_email(contact):
|
156 |
+
return send_email_notification(
|
157 |
+
contact,
|
158 |
+
"Anomaly Detected - Video Anomaly Detector",
|
159 |
+
message,
|
160 |
+
image
|
161 |
+
)
|
162 |
+
else:
|
163 |
+
st.warning("Invalid email format. Notification not sent.")
|
164 |
+
return False
|
165 |
+
elif notification_type == "whatsapp":
|
166 |
+
if validate_phone(contact):
|
167 |
+
return send_whatsapp_notification(contact, message)
|
168 |
+
else:
|
169 |
+
st.warning("Invalid phone number format. Please include country code (e.g., +1234567890). Notification not sent.")
|
170 |
+
return False
|
171 |
+
return False
|
172 |
+
|
173 |
+
# Helper functions for displaying results
|
174 |
+
def display_single_result(result):
|
175 |
+
"""Display a single analysis result"""
|
176 |
+
if isinstance(result, dict):
|
177 |
+
# This is a single frame result or cumulative result
|
178 |
+
if "anomaly_detected" in result:
|
179 |
+
# Create columns for image and text
|
180 |
+
if "frame" in result:
|
181 |
+
col1, col2 = st.columns([1, 2])
|
182 |
+
with col1:
|
183 |
+
st.image(result["frame"], caption="Captured Frame", use_column_width=True)
|
184 |
+
|
185 |
+
with col2:
|
186 |
+
anomaly_detected = result["anomaly_detected"]
|
187 |
+
|
188 |
+
# Start building the HTML content
|
189 |
+
html_content = f"""
|
190 |
+
<div class='result-details'>
|
191 |
+
"""
|
192 |
+
|
193 |
+
# Add confidence if available
|
194 |
+
if "confidence" in result:
|
195 |
+
html_content += f"<p><strong>Confidence:</strong> {result['confidence']}%</p>"
|
196 |
+
|
197 |
+
# Add analysis/text if available (check multiple possible keys)
|
198 |
+
analysis_text = None
|
199 |
+
for key in ["analysis", "text", "description"]:
|
200 |
+
if key in result and result[key]:
|
201 |
+
analysis_text = result[key]
|
202 |
+
break
|
203 |
+
|
204 |
+
if analysis_text:
|
205 |
+
html_content += f"<p><strong>Analysis:</strong> {analysis_text}</p>"
|
206 |
+
|
207 |
+
# Add anomaly type if available
|
208 |
+
if "anomaly_type" in result and result["anomaly_type"]:
|
209 |
+
html_content += f"<p><strong>Anomaly Type:</strong> {result['anomaly_type']}</p>"
|
210 |
+
|
211 |
+
# Close the div
|
212 |
+
html_content += "</div>"
|
213 |
+
|
214 |
+
# Display the HTML content
|
215 |
+
st.markdown(html_content, unsafe_allow_html=True)
|
216 |
+
else:
|
217 |
+
# No frame available, just show the text
|
218 |
+
# Start building the HTML content
|
219 |
+
html_content = "<div class='result-details'>"
|
220 |
+
|
221 |
+
# Add confidence if available
|
222 |
+
if "confidence" in result:
|
223 |
+
html_content += f"<p><strong>Confidence:</strong> {result['confidence']}%</p>"
|
224 |
+
|
225 |
+
# Add analysis/text if available (check multiple possible keys)
|
226 |
+
analysis_text = None
|
227 |
+
for key in ["analysis", "text", "description"]:
|
228 |
+
if key in result and result[key]:
|
229 |
+
analysis_text = result[key]
|
230 |
+
break
|
231 |
+
|
232 |
+
if analysis_text:
|
233 |
+
html_content += f"<p><strong>Analysis:</strong> {analysis_text}</p>"
|
234 |
+
|
235 |
+
# Add anomaly type if available
|
236 |
+
if "anomaly_type" in result and result["anomaly_type"]:
|
237 |
+
html_content += f"<p><strong>Anomaly Type:</strong> {result['anomaly_type']}</p>"
|
238 |
+
|
239 |
+
# Close the div
|
240 |
+
html_content += "</div>"
|
241 |
+
|
242 |
+
# Display the HTML content
|
243 |
+
st.markdown(html_content, unsafe_allow_html=True)
|
244 |
+
else:
|
245 |
+
# Display other types of results
|
246 |
+
st.json(result)
|
247 |
+
else:
|
248 |
+
# Unknown result type
|
249 |
+
st.write(result)
|
250 |
+
|
251 |
+
def display_results(results, analysis_depth):
|
252 |
+
"""Display analysis results based on analysis depth"""
|
253 |
+
if not results:
|
254 |
+
st.warning("No results to display")
|
255 |
+
return
|
256 |
+
|
257 |
+
# Add a main results header
|
258 |
+
st.markdown("<h2 class='section-header'>📊 Analysis Results</h2>", unsafe_allow_html=True)
|
259 |
+
|
260 |
+
# Add high-level summary at the top
|
261 |
+
if analysis_depth == "granular":
|
262 |
+
# For granular analysis, check if any frame has an anomaly
|
263 |
+
anomaly_frames = sum(1 for r in results if r.get("anomaly_detected", False))
|
264 |
+
total_frames = len(results)
|
265 |
+
|
266 |
+
if anomaly_frames > 0:
|
267 |
+
# Get the anomaly types from frames with anomalies
|
268 |
+
anomaly_types = set(r.get("anomaly_type", "Unknown") for r in results if r.get("anomaly_detected", False))
|
269 |
+
anomaly_types_str = ", ".join(anomaly_types)
|
270 |
+
|
271 |
+
st.markdown(
|
272 |
+
f"""
|
273 |
+
<div class='result-box anomaly'>
|
274 |
+
<h3>⚠️ ANOMALY DETECTED</h3>
|
275 |
+
<p><strong>Frames with anomalies:</strong> {anomaly_frames} out of {total_frames}</p>
|
276 |
+
<p><strong>Anomaly types:</strong> {anomaly_types_str}</p>
|
277 |
+
</div>
|
278 |
+
""",
|
279 |
+
unsafe_allow_html=True
|
280 |
+
)
|
281 |
+
else:
|
282 |
+
st.markdown(
|
283 |
+
"""
|
284 |
+
<div class='result-box normal'>
|
285 |
+
<h3>✅ No Anomalies Detected</h3>
|
286 |
+
<p>No anomalies were detected in any of the analyzed frames.</p>
|
287 |
+
</div>
|
288 |
+
""",
|
289 |
+
unsafe_allow_html=True
|
290 |
+
)
|
291 |
+
else: # cumulative
|
292 |
+
# For cumulative analysis, check the overall result
|
293 |
+
if results.get("anomaly_detected", False):
|
294 |
+
anomaly_type = results.get("anomaly_type", "Unknown")
|
295 |
+
st.markdown(
|
296 |
+
f"""
|
297 |
+
<div class='result-box anomaly'>
|
298 |
+
<h3>⚠️ ANOMALY DETECTED</h3>
|
299 |
+
<p><strong>Anomaly type:</strong> {anomaly_type}</p>
|
300 |
+
</div>
|
301 |
+
""",
|
302 |
+
unsafe_allow_html=True
|
303 |
+
)
|
304 |
+
else:
|
305 |
+
st.markdown(
|
306 |
+
"""
|
307 |
+
<div class='result-box normal'>
|
308 |
+
<h3>✅ No Anomalies Detected</h3>
|
309 |
+
<p>No anomalies were detected in the video.</p>
|
310 |
+
</div>
|
311 |
+
""",
|
312 |
+
unsafe_allow_html=True
|
313 |
+
)
|
314 |
+
|
315 |
+
# Display detailed results
|
316 |
+
if analysis_depth == "granular":
|
317 |
+
# For granular analysis, results is a list of frame analyses
|
318 |
+
st.markdown("<h3 class='sub-header'>🔍 Frame-by-Frame Analysis</h3>", unsafe_allow_html=True)
|
319 |
+
|
320 |
+
# Display detailed view directly without tabs
|
321 |
+
for i, result in enumerate(results):
|
322 |
+
with st.expander(f"Frame {i+1} - {'⚠️ ANOMALY' if result.get('anomaly_detected', False) else '✅ Normal'}"):
|
323 |
+
display_single_result(result)
|
324 |
+
|
325 |
+
else: # cumulative
|
326 |
+
st.markdown("<h3 class='sub-header'>🔍 Overall Video Analysis</h3>", unsafe_allow_html=True)
|
327 |
+
display_single_result(results)
|
328 |
+
|
329 |
+
# Display key frames if available
|
330 |
+
if "frames" in results and results["frames"]:
|
331 |
+
st.markdown("<h3 class='sub-header'>🖼️ Key Frames</h3>", unsafe_allow_html=True)
|
332 |
+
|
333 |
+
# Create a row of columns for the frames
|
334 |
+
num_frames = len(results["frames"])
|
335 |
+
cols = st.columns(min(3, num_frames))
|
336 |
+
|
337 |
+
# Display each frame in a column
|
338 |
+
for i, (col, frame) in enumerate(zip(cols, results["frames"])):
|
339 |
+
with col:
|
340 |
+
st.image(frame, caption=f"Key Frame {i+1}", use_column_width=True)
|
341 |
+
|
342 |
+
# Initialize session state for stop button
|
343 |
+
if 'stop_requested' not in st.session_state:
|
344 |
+
st.session_state.stop_requested = False
|
345 |
+
|
346 |
+
def request_stop():
|
347 |
+
st.session_state.stop_requested = True
|
348 |
+
|
349 |
+
# Conditionally import Phi-4 detector
|
350 |
+
try:
|
351 |
+
from phi4_detector import Phi4AnomalyDetector
|
352 |
+
PHI4_AVAILABLE = True
|
353 |
+
except ImportError:
|
354 |
+
PHI4_AVAILABLE = False
|
355 |
+
|
356 |
+
# Load environment variables from .env file
|
357 |
+
load_dotenv()
|
358 |
+
|
359 |
+
# Set page configuration
|
360 |
+
st.set_page_config(
|
361 |
+
page_title="Video Anomaly Detector",
|
362 |
+
page_icon="🔍",
|
363 |
+
layout="wide"
|
364 |
+
)
|
365 |
+
|
366 |
+
# Custom CSS for better UI
|
367 |
+
st.markdown("""
|
368 |
+
<style>
|
369 |
+
@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@300;400;500;600;700&display=swap');
|
370 |
+
|
371 |
+
html, body, [class*="css"] {
|
372 |
+
font-family: 'Poppins', sans-serif;
|
373 |
+
}
|
374 |
+
|
375 |
+
.main-header {
|
376 |
+
font-size: 2.8rem;
|
377 |
+
font-weight: 700;
|
378 |
+
color: #5046E5;
|
379 |
+
text-align: center;
|
380 |
+
margin-bottom: 1rem;
|
381 |
+
padding-top: 1.5rem;
|
382 |
+
}
|
383 |
+
|
384 |
+
.sub-header {
|
385 |
+
font-size: 1.8rem;
|
386 |
+
font-weight: 600;
|
387 |
+
color: #36B37E;
|
388 |
+
margin-bottom: 1.2rem;
|
389 |
+
}
|
390 |
+
|
391 |
+
.section-header {
|
392 |
+
font-size: 2rem;
|
393 |
+
font-weight: 600;
|
394 |
+
color: #5046E5;
|
395 |
+
margin-top: 2rem;
|
396 |
+
margin-bottom: 1rem;
|
397 |
+
}
|
398 |
+
|
399 |
+
.result-box {
|
400 |
+
padding: 15px;
|
401 |
+
border-radius: 10px;
|
402 |
+
margin-bottom: 15px;
|
403 |
+
}
|
404 |
+
|
405 |
+
.result-box.anomaly {
|
406 |
+
background-color: rgba(255, 76, 76, 0.1);
|
407 |
+
border: 1px solid rgba(255, 76, 76, 0.3);
|
408 |
+
}
|
409 |
+
|
410 |
+
.result-box.normal {
|
411 |
+
background-color: rgba(54, 179, 126, 0.1);
|
412 |
+
border: 1px solid rgba(54, 179, 126, 0.3);
|
413 |
+
}
|
414 |
+
|
415 |
+
.result-box h3 {
|
416 |
+
margin-top: 0;
|
417 |
+
margin-bottom: 10px;
|
418 |
+
}
|
419 |
+
|
420 |
+
.result-box.anomaly h3 {
|
421 |
+
color: #FF4C4C;
|
422 |
+
}
|
423 |
+
|
424 |
+
.result-box.normal h3 {
|
425 |
+
color: #36B37E;
|
426 |
+
}
|
427 |
+
|
428 |
+
.result-container {
|
429 |
+
background-color: #f8f9fa;
|
430 |
+
padding: 1.8rem;
|
431 |
+
border-radius: 12px;
|
432 |
+
margin-bottom: 1.5rem;
|
433 |
+
border: 1px solid #e9ecef;
|
434 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
|
435 |
+
}
|
436 |
+
|
437 |
+
.stProgress > div > div > div {
|
438 |
+
background-color: #5046E5;
|
439 |
+
}
|
440 |
+
|
441 |
+
.stButton>button {
|
442 |
+
background-color: #5046E5;
|
443 |
+
color: white;
|
444 |
+
font-weight: 600;
|
445 |
+
border-radius: 8px;
|
446 |
+
padding: 0.5rem 1rem;
|
447 |
+
border: none;
|
448 |
+
}
|
449 |
+
|
450 |
+
.stButton>button:hover {
|
451 |
+
background-color: #4038C7;
|
452 |
+
}
|
453 |
+
|
454 |
+
.stSelectbox>div>div {
|
455 |
+
background-color: #f8f9fa;
|
456 |
+
border-radius: 8px;
|
457 |
+
}
|
458 |
+
|
459 |
+
.stRadio>div {
|
460 |
+
padding: 10px;
|
461 |
+
background-color: #f8f9fa;
|
462 |
+
border-radius: 8px;
|
463 |
+
}
|
464 |
+
|
465 |
+
.stExpander>div {
|
466 |
+
border-radius: 8px;
|
467 |
+
border: 1px solid #e9ecef;
|
468 |
+
}
|
469 |
+
|
470 |
+
.model-info {
|
471 |
+
font-size: 0.9rem;
|
472 |
+
color: #6c757d;
|
473 |
+
font-style: italic;
|
474 |
+
margin-top: 0.5rem;
|
475 |
+
}
|
476 |
+
|
477 |
+
.icon-text {
|
478 |
+
display: flex;
|
479 |
+
align-items: center;
|
480 |
+
gap: 0.5rem;
|
481 |
+
}
|
482 |
+
|
483 |
+
.footer {
|
484 |
+
text-align: center;
|
485 |
+
color: #6c757d;
|
486 |
+
font-size: 0.9rem;
|
487 |
+
margin-top: 2rem;
|
488 |
+
}
|
489 |
+
|
490 |
+
.anomaly-true {
|
491 |
+
color: #dc3545;
|
492 |
+
font-weight: bold;
|
493 |
+
}
|
494 |
+
|
495 |
+
.anomaly-false {
|
496 |
+
color: #28a745;
|
497 |
+
font-weight: bold;
|
498 |
+
}
|
499 |
+
|
500 |
+
.anomaly-type {
|
501 |
+
font-weight: bold;
|
502 |
+
margin-top: 0.5rem;
|
503 |
+
}
|
504 |
+
|
505 |
+
.anomaly-box {
|
506 |
+
padding: 1rem;
|
507 |
+
border-radius: 8px;
|
508 |
+
margin-bottom: 1rem;
|
509 |
+
}
|
510 |
+
|
511 |
+
.anomaly-box-true {
|
512 |
+
background-color: rgba(220, 53, 69, 0.1);
|
513 |
+
border: 1px solid rgba(220, 53, 69, 0.3);
|
514 |
+
}
|
515 |
+
|
516 |
+
.anomaly-box-false {
|
517 |
+
background-color: rgba(40, 167, 69, 0.1);
|
518 |
+
border: 1px solid rgba(40, 167, 69, 0.3);
|
519 |
+
}
|
520 |
+
|
521 |
+
.instructions-container {
|
522 |
+
font-size: 1.1rem;
|
523 |
+
line-height: 1.8;
|
524 |
+
}
|
525 |
+
|
526 |
+
.instructions-container ol {
|
527 |
+
padding-left: 1.5rem;
|
528 |
+
}
|
529 |
+
|
530 |
+
.instructions-container ul {
|
531 |
+
padding-left: 1.5rem;
|
532 |
+
}
|
533 |
+
|
534 |
+
.instructions-container li {
|
535 |
+
margin-bottom: 0.5rem;
|
536 |
+
}
|
537 |
+
|
538 |
+
.live-stream-container {
|
539 |
+
border: 2px solid #5046E5;
|
540 |
+
border-radius: 12px;
|
541 |
+
padding: 1rem;
|
542 |
+
margin-top: 1rem;
|
543 |
+
}
|
544 |
+
|
545 |
+
.result-details {
|
546 |
+
padding: 15px;
|
547 |
+
border-radius: 10px;
|
548 |
+
margin-bottom: 15px;
|
549 |
+
background-color: rgba(80, 70, 229, 0.05);
|
550 |
+
border: 1px solid rgba(80, 70, 229, 0.2);
|
551 |
+
}
|
552 |
+
|
553 |
+
.result-details p {
|
554 |
+
margin-bottom: 10px;
|
555 |
+
}
|
556 |
+
|
557 |
+
.result-details strong {
|
558 |
+
color: #5046E5;
|
559 |
+
}
|
560 |
+
|
561 |
+
.video-preview-container {
|
562 |
+
border: 1px solid #e9ecef;
|
563 |
+
border-radius: 10px;
|
564 |
+
padding: 15px;
|
565 |
+
margin-bottom: 20px;
|
566 |
+
background-color: rgba(80, 70, 229, 0.03);
|
567 |
+
}
|
568 |
+
|
569 |
+
.video-preview-container video {
|
570 |
+
width: 100%;
|
571 |
+
border-radius: 8px;
|
572 |
+
margin-bottom: 10px;
|
573 |
+
}
|
574 |
+
|
575 |
+
.video-info {
|
576 |
+
display: flex;
|
577 |
+
justify-content: space-between;
|
578 |
+
margin-top: 10px;
|
579 |
+
}
|
580 |
+
|
581 |
+
.video-info-item {
|
582 |
+
text-align: center;
|
583 |
+
padding: 8px;
|
584 |
+
background-color: #f8f9fa;
|
585 |
+
border-radius: 5px;
|
586 |
+
flex: 1;
|
587 |
+
margin: 0 5px;
|
588 |
+
}
|
589 |
+
</style>
|
590 |
+
""", unsafe_allow_html=True)
|
591 |
+
|
592 |
+
# Header with icon
|
593 |
+
st.markdown("<h1 class='main-header'>🔍 Video Anomaly Detector</h1>", unsafe_allow_html=True)
|
594 |
+
st.markdown("<p style='text-align: center; font-size: 1.2rem; margin-bottom: 2rem;'>Analyze video frames for anomalies using advanced AI models</p>", unsafe_allow_html=True)
|
595 |
+
|
596 |
+
# Sidebar for inputs
|
597 |
+
with st.sidebar:
|
598 |
+
st.markdown("<h2 class='sub-header'>⚙️ Settings</h2>", unsafe_allow_html=True)
|
599 |
+
|
600 |
+
# Input source selection
|
601 |
+
st.markdown("<div class='icon-text'><span>📹</span><span>Input Source</span></div>", unsafe_allow_html=True)
|
602 |
+
input_source = st.radio(
|
603 |
+
"",
|
604 |
+
["Video File", "Live Stream"],
|
605 |
+
index=0,
|
606 |
+
help="Select the input source for analysis"
|
607 |
+
)
|
608 |
+
|
609 |
+
# File uploader or stream URL based on input source
|
610 |
+
if input_source == "Video File":
|
611 |
+
st.markdown("<div class='icon-text'><span>📁</span><span>Upload Video</span></div>", unsafe_allow_html=True)
|
612 |
+
|
613 |
+
# Find sample .mp4 files in the current directory
|
614 |
+
sample_files = []
|
615 |
+
for file in os.listdir():
|
616 |
+
if file.endswith('.mp4'):
|
617 |
+
sample_files.append(file)
|
618 |
+
|
619 |
+
# Show sample files if available
|
620 |
+
if sample_files:
|
621 |
+
st.info(f"Sample videos available: {', '.join(sample_files)}")
|
622 |
+
use_sample = st.checkbox("Use a sample video instead of uploading")
|
623 |
+
|
624 |
+
if use_sample:
|
625 |
+
selected_sample = st.selectbox("Select a sample video", sample_files)
|
626 |
+
uploaded_file = selected_sample # We'll handle this specially later
|
627 |
+
|
628 |
+
# Add video preview section
|
629 |
+
st.markdown("<h3 class='sub-header'>🎬 Video Preview</h3>", unsafe_allow_html=True)
|
630 |
+
|
631 |
+
# Create a container for the video preview with custom styling
|
632 |
+
st.markdown("<div class='video-preview-container'>", unsafe_allow_html=True)
|
633 |
+
|
634 |
+
# Get the full path to the selected sample video
|
635 |
+
video_path = os.path.join(os.getcwd(), selected_sample)
|
636 |
+
|
637 |
+
# Display the video player
|
638 |
+
st.video(video_path)
|
639 |
+
|
640 |
+
# Display video information
|
641 |
+
try:
|
642 |
+
cap = cv2.VideoCapture(video_path)
|
643 |
+
if cap.isOpened():
|
644 |
+
# Get video properties
|
645 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
646 |
+
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
647 |
+
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
648 |
+
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
649 |
+
|
650 |
+
# Calculate duration
|
651 |
+
duration = frame_count / fps if fps > 0 else 0
|
652 |
+
|
653 |
+
# Format duration as minutes:seconds
|
654 |
+
minutes = int(duration // 60)
|
655 |
+
seconds = int(duration % 60)
|
656 |
+
duration_str = f"{minutes}:{seconds:02d}"
|
657 |
+
|
658 |
+
|
659 |
+
cap.release()
|
660 |
+
except Exception as e:
|
661 |
+
st.warning(f"Could not read video properties: {str(e)}")
|
662 |
+
|
663 |
+
st.markdown("</div>", unsafe_allow_html=True)
|
664 |
+
else:
|
665 |
+
uploaded_file = st.file_uploader("", type=["mp4", "avi", "mov", "mkv"])
|
666 |
+
else:
|
667 |
+
uploaded_file = st.file_uploader("", type=["mp4", "avi", "mov", "mkv"])
|
668 |
+
|
669 |
+
stream_source = None
|
670 |
+
else: # Live Stream
|
671 |
+
st.markdown("<div class='icon-text'><span>🔗</span><span>Stream Source</span></div>", unsafe_allow_html=True)
|
672 |
+
stream_options = ["Webcam", "IP Camera / RTSP Stream"]
|
673 |
+
stream_type = st.selectbox("", stream_options, index=0)
|
674 |
+
|
675 |
+
if stream_type == "Webcam":
|
676 |
+
stream_source = 0 # Default webcam
|
677 |
+
else:
|
678 |
+
stream_source = st.text_input("Stream URL", placeholder="rtsp://username:password@ip_address:port/path")
|
679 |
+
|
680 |
+
# Max frames to process for live stream
|
681 |
+
st.markdown("<div class='icon-text'><span>🔢</span><span>Frame Capture Settings</span></div>", unsafe_allow_html=True)
|
682 |
+
|
683 |
+
capture_mode = st.radio(
|
684 |
+
"Capture Mode",
|
685 |
+
["Frame Count Limit", "Time Interval (Continuous)"],
|
686 |
+
index=0,
|
687 |
+
help="Choose how to capture frames from the live stream"
|
688 |
+
)
|
689 |
+
|
690 |
+
if capture_mode == "Frame Count Limit":
|
691 |
+
max_frames = st.number_input(
|
692 |
+
"Maximum Frames",
|
693 |
+
min_value=1,
|
694 |
+
max_value=100,
|
695 |
+
value=30,
|
696 |
+
help="Maximum number of frames to process from the live stream"
|
697 |
+
)
|
698 |
+
time_interval = None
|
699 |
+
else: # Time Interval mode
|
700 |
+
max_frames = None # No frame limit in time interval mode
|
701 |
+
time_interval = st.number_input(
|
702 |
+
"Seconds Between Captures",
|
703 |
+
min_value=1,
|
704 |
+
max_value=60,
|
705 |
+
value=5,
|
706 |
+
help="Capture one frame every X seconds indefinitely"
|
707 |
+
)
|
708 |
+
st.info("⚠️ In time interval mode, processing will continue indefinitely. Use the Stop button to end capture.")
|
709 |
+
|
710 |
+
uploaded_file = None
|
711 |
+
|
712 |
+
# Model selection
|
713 |
+
st.markdown("<div class='icon-text'><span>🧠</span><span>AI Model</span></div>", unsafe_allow_html=True)
|
714 |
+
|
715 |
+
# Add Phi-4 to the model options if available
|
716 |
+
model_options = ["GPT-4o", "GPT-4o-mini"]
|
717 |
+
if PHI4_AVAILABLE:
|
718 |
+
model_options.append("Phi-4")
|
719 |
+
model_options.append("Phi-3 (Coming Soon)")
|
720 |
+
|
721 |
+
model = st.selectbox(
|
722 |
+
"",
|
723 |
+
model_options,
|
724 |
+
index=0,
|
725 |
+
help="Select the AI model to use for analysis"
|
726 |
+
)
|
727 |
+
|
728 |
+
# Display model info based on selection
|
729 |
+
if model == "GPT-4o":
|
730 |
+
st.markdown("<div class='model-info'>Most powerful model with highest accuracy</div>", unsafe_allow_html=True)
|
731 |
+
model_value = "gpt-4o"
|
732 |
+
use_phi4 = False
|
733 |
+
elif model == "GPT-4o-mini":
|
734 |
+
st.markdown("<div class='model-info'>Faster and more cost-effective</div>", unsafe_allow_html=True)
|
735 |
+
model_value = "gpt-4o-mini"
|
736 |
+
use_phi4 = False
|
737 |
+
elif model == "Phi-4":
|
738 |
+
st.markdown("<div class='model-info'>Microsoft's multimodal model, runs locally</div>", unsafe_allow_html=True)
|
739 |
+
model_value = "phi-4"
|
740 |
+
use_phi4 = True
|
741 |
+
else: # Phi-3
|
742 |
+
st.markdown("<div class='model-info'>Not yet implemented</div>", unsafe_allow_html=True)
|
743 |
+
model_value = "gpt-4o" # Default to GPT-4o if Phi-3 is selected
|
744 |
+
use_phi4 = False
|
745 |
+
st.warning("Phi-3 support is coming soon. Using GPT-4o instead.")
|
746 |
+
|
747 |
+
# Skip frames input with icon
|
748 |
+
st.markdown("<div class='icon-text'><span>⏭️</span><span>Frame Skip Rate</span></div>", unsafe_allow_html=True)
|
749 |
+
skip_frames = st.number_input(
|
750 |
+
"",
|
751 |
+
min_value=0,
|
752 |
+
max_value=100,
|
753 |
+
value=5,
|
754 |
+
help="Higher values process fewer frames, making analysis faster but potentially less accurate"
|
755 |
+
)
|
756 |
+
|
757 |
+
# Analysis depth selection
|
758 |
+
st.markdown("<div class='icon-text'><span>🔬</span><span>Analysis Depth</span></div>", unsafe_allow_html=True)
|
759 |
+
analysis_depth = st.radio(
|
760 |
+
"",
|
761 |
+
["Granular (Frame by Frame)", "Cumulative (Overall)"],
|
762 |
+
index=0,
|
763 |
+
help="Granular provides analysis for each frame, Cumulative gives an overall assessment"
|
764 |
+
)
|
765 |
+
|
766 |
+
# Map the radio button value to the actual value
|
767 |
+
analysis_depth_value = "granular" if analysis_depth == "Granular (Frame by Frame)" else "cumulative"
|
768 |
+
|
769 |
+
# Notification options
|
770 |
+
st.markdown("<div class='icon-text'><span>🔔</span><span>Notifications</span></div>", unsafe_allow_html=True)
|
771 |
+
enable_notifications = st.checkbox("Enable notifications for anomaly detection", value=False)
|
772 |
+
|
773 |
+
if enable_notifications:
|
774 |
+
notification_type = st.radio(
|
775 |
+
"Notification Method",
|
776 |
+
["Email", "WhatsApp"],
|
777 |
+
index=0,
|
778 |
+
help="Select how you want to be notified when anomalies are detected"
|
779 |
+
)
|
780 |
+
|
781 |
+
if notification_type == "Email":
|
782 |
+
notification_email = st.text_input(
|
783 |
+
"Email Address",
|
784 |
+
placeholder="[email protected]",
|
785 |
+
help="Enter the email address to receive notifications"
|
786 |
+
)
|
787 |
+
st.session_state.notification_contact = notification_email if notification_email else None
|
788 |
+
st.session_state.notification_type = "email" if notification_email else None
|
789 |
+
|
790 |
+
else: # WhatsApp
|
791 |
+
notification_phone = st.text_input(
|
792 |
+
"WhatsApp Number",
|
793 |
+
placeholder="+1234567890 (include country code)",
|
794 |
+
help="Enter your WhatsApp number with country code"
|
795 |
+
)
|
796 |
+
st.session_state.notification_contact = notification_phone if notification_phone else None
|
797 |
+
st.session_state.notification_type = "whatsapp" if notification_phone else None
|
798 |
+
else:
|
799 |
+
st.session_state.notification_type = None
|
800 |
+
st.session_state.notification_contact = None
|
801 |
+
|
802 |
+
# Prompt input with icon
|
803 |
+
st.markdown("<div class='icon-text'><span>💬</span><span>Anomaly Description</span></div>", unsafe_allow_html=True)
|
804 |
+
prompt = st.text_area(
|
805 |
+
"",
|
806 |
+
value="Analyze this frame and describe if there are any unusual or anomalous activities or objects. If you detect anything unusual, explain what it is and why it might be considered an anomaly.",
|
807 |
+
height=150,
|
808 |
+
help="Describe what kind of anomaly to look for"
|
809 |
+
)
|
810 |
+
|
811 |
+
# API key input with default from environment variable and icon (only show for OpenAI models)
|
812 |
+
if not use_phi4:
|
813 |
+
st.markdown("<div class='icon-text'><span>🔑</span><span>OpenAI API Key</span></div>", unsafe_allow_html=True)
|
814 |
+
default_api_key = os.getenv("OPENAI_API_KEY", "")
|
815 |
+
api_key = st.text_input(
|
816 |
+
"",
|
817 |
+
value=default_api_key,
|
818 |
+
type="password",
|
819 |
+
help="Your OpenAI API key with access to the selected model"
|
820 |
+
)
|
821 |
+
else:
|
822 |
+
# For Phi-4, we don't need an API key
|
823 |
+
api_key = "not-needed-for-phi4"
|
824 |
+
|
825 |
+
# Submit button with icon
|
826 |
+
submit_button = st.button("���� Analyze Video")
|
827 |
+
|
828 |
+
# Main content area for video file
|
829 |
+
if input_source == "Video File" and uploaded_file is not None:
|
830 |
+
# Display video info
|
831 |
+
st.markdown("<h2 class='sub-header'>📊 Video Information</h2>", unsafe_allow_html=True)
|
832 |
+
|
833 |
+
# Check if we're using a sample file or an uploaded file
|
834 |
+
if isinstance(uploaded_file, str) and os.path.exists(uploaded_file):
|
835 |
+
# This is a sample file from the directory
|
836 |
+
video_path = uploaded_file
|
837 |
+
st.success(f"Using sample video: {os.path.basename(video_path)}")
|
838 |
+
else:
|
839 |
+
# This is an uploaded file
|
840 |
+
# Save uploaded file to a temporary file
|
841 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
|
842 |
+
tmp_file.write(uploaded_file.getvalue())
|
843 |
+
video_path = tmp_file.name
|
844 |
+
|
845 |
+
# Get video metadata
|
846 |
+
# For video files, use the default backend instead of DirectShow
|
847 |
+
cap = cv2.VideoCapture(video_path)
|
848 |
+
|
849 |
+
# Don't set MJPG format for video files as it can interfere with proper decoding
|
850 |
+
# cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M','J','P','G'))
|
851 |
+
|
852 |
+
# Try to get video properties
|
853 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
854 |
+
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
855 |
+
|
856 |
+
# Prevent division by zero, but only show warning for live streams
|
857 |
+
# For video files, this is likely an actual error
|
858 |
+
if fps <= 0:
|
859 |
+
# Check if this is a video file (not a webcam/stream)
|
860 |
+
if isinstance(video_path, str) and os.path.exists(video_path):
|
861 |
+
# This is a file that exists but has FPS issues
|
862 |
+
fps = 30.0 # Use a default value
|
863 |
+
st.warning(f"Could not determine frame rate for video file: {os.path.basename(video_path)}. Using default value of 30 FPS.")
|
864 |
+
else:
|
865 |
+
# This is likely a webcam or stream
|
866 |
+
fps = 30.0
|
867 |
+
st.info("Using default frame rate of 30 FPS for live stream.")
|
868 |
+
|
869 |
+
duration = frame_count / fps
|
870 |
+
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
871 |
+
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
872 |
+
cap.release()
|
873 |
+
|
874 |
+
# Display video metadata in a nicer format
|
875 |
+
col1, col2, col3 = st.columns(3)
|
876 |
+
with col1:
|
877 |
+
st.markdown("<div style='text-align: center;'>⏱️</div>", unsafe_allow_html=True)
|
878 |
+
st.metric("Duration", f"{duration:.2f} seconds")
|
879 |
+
with col2:
|
880 |
+
st.markdown("<div style='text-align: center;'>🎞️</div>", unsafe_allow_html=True)
|
881 |
+
st.metric("Total Frames", frame_count)
|
882 |
+
with col3:
|
883 |
+
st.markdown("<div style='text-align: center;'>📐</div>", unsafe_allow_html=True)
|
884 |
+
st.metric("Resolution", f"{width}x{height}")
|
885 |
+
|
886 |
+
# Display estimated frames to process
|
887 |
+
estimated_frames = frame_count // (skip_frames + 1) + 1
|
888 |
+
st.info(f"With current settings, approximately {estimated_frames} frames will be processed.")
|
889 |
+
|
890 |
+
# Main content area for live stream
|
891 |
+
elif input_source == "Live Stream" and stream_source is not None:
|
892 |
+
# Display live stream info
|
893 |
+
st.markdown("<h2 class='sub-header'>📊 Live Stream Information</h2>", unsafe_allow_html=True)
|
894 |
+
|
895 |
+
# Display stream source info
|
896 |
+
if stream_source == 0:
|
897 |
+
st.info("Using default webcam as the stream source.")
|
898 |
+
else:
|
899 |
+
st.info(f"Using stream URL: {stream_source}")
|
900 |
+
|
901 |
+
# Display estimated frames to process
|
902 |
+
st.info(f"Will process up to {max_frames} frames with a skip rate of {skip_frames}.")
|
903 |
+
|
904 |
+
# Show a placeholder for the live stream
|
905 |
+
st.markdown("<div class='live-stream-container'><p style='text-align: center;'>Live stream preview will appear here during processing</p></div>", unsafe_allow_html=True)
|
906 |
+
|
907 |
+
# Process video or stream when submit button is clicked
|
908 |
+
if submit_button:
|
909 |
+
if not api_key and not use_phi4:
|
910 |
+
st.error("⚠️ Please enter your OpenAI API key")
|
911 |
+
elif input_source == "Video File" and uploaded_file is None:
|
912 |
+
st.error("⚠️ Please upload a video file")
|
913 |
+
elif input_source == "Live Stream" and stream_source is None:
|
914 |
+
st.error("⚠️ Please provide a valid stream source")
|
915 |
+
else:
|
916 |
+
try:
|
917 |
+
# Initialize detector based on selected model
|
918 |
+
if use_phi4:
|
919 |
+
with st.spinner("Loading Phi-4 model... This may take a while if downloading for the first time."):
|
920 |
+
detector = Phi4AnomalyDetector()
|
921 |
+
st.success("Phi-4 model loaded successfully!")
|
922 |
+
else:
|
923 |
+
detector = VideoAnomalyDetector(api_key, model_value)
|
924 |
+
|
925 |
+
# Progress bar and status
|
926 |
+
st.markdown("<h2 class='sub-header'>⏳ Processing Video</h2>", unsafe_allow_html=True)
|
927 |
+
progress_bar = st.progress(0)
|
928 |
+
status_text = st.empty()
|
929 |
+
|
930 |
+
# Create a callback function to update progress
|
931 |
+
def update_progress(current, total):
|
932 |
+
if total == -1:
|
933 |
+
# Continuous mode
|
934 |
+
status_text.text(f"Processed {current} frames (continuous mode)...")
|
935 |
+
else:
|
936 |
+
# Normal mode with a known total
|
937 |
+
if total > 0:
|
938 |
+
progress = current / total
|
939 |
+
progress_bar.progress(progress)
|
940 |
+
else:
|
941 |
+
# Handle case where total is zero
|
942 |
+
progress_bar.progress(0)
|
943 |
+
status_text.text(f"Processing frame {current+1} of {total if total > 0 else '?'}...")
|
944 |
+
|
945 |
+
# Process the video or stream
|
946 |
+
start_time = time.time()
|
947 |
+
|
948 |
+
if input_source == "Video File":
|
949 |
+
results = detector.process_video(video_path, skip_frames, prompt, analysis_depth_value, update_progress)
|
950 |
+
print(f"Results: {results}")
|
951 |
+
# Results will be displayed after processing
|
952 |
+
|
953 |
+
else: # Live Stream
|
954 |
+
if capture_mode == "Frame Count Limit":
|
955 |
+
# Process with frame count limit (original behavior)
|
956 |
+
                results = detector.process_live_stream(stream_source, skip_frames, prompt, analysis_depth_value, max_frames, update_progress)
                # Results will be displayed after processing

            else:  # Time Interval mode
                # Create a placeholder for continuous results
                results_container = st.empty()

                # Reset stop request flag at the beginning of processing
                st.session_state.stop_requested = False

                # Create a stop button outside the loop
                st.button("Stop Capture", key="stop_continuous_main", on_click=request_stop)

                # Process with time interval (generator mode)
                results_generator = detector.process_live_stream(
                    stream_source, skip_frames, prompt, analysis_depth_value,
                    None, update_progress, time_interval
                )

                # Collect results for cumulative analysis if needed
                all_results = []
                frame_counter = 0

                try:
                    # Process results as they come in
                    for result in results_generator:
                        # Check if stop button was pressed
                        if st.session_state.stop_requested:
                            st.success("Capture stopped by user")
                            break

                        frame_counter += 1
                        all_results.append(result)

                        # Display the latest result
                        with results_container.container():
                            if analysis_depth_value == "granular":
                                # For granular analysis, show the latest frame result
                                st.markdown(f"### Frame {frame_counter}")
                                display_single_result(result)

                                # Send notification if anomaly detected and notifications are enabled
                                if result.get("anomaly_detected", False) and st.session_state.notification_type and st.session_state.notification_contact:
                                    # Create notification message
                                    anomaly_type = result.get("anomaly_type", "Unknown")
                                    anomaly_message = f"Anomaly detected in live stream (Frame {frame_counter}).\n"
                                    anomaly_message += f"Anomaly type: {anomaly_type}\n\n"

                                    # Add analysis details
                                    analysis_text = None
                                    for key in ["analysis", "text", "description"]:
                                        if key in result and result[key]:
                                            analysis_text = result[key]
                                            break

                                    if analysis_text:
                                        anomaly_message += f"Analysis: {analysis_text[:500]}..."

                                    # Send notification
                                    with st.spinner("Sending notification about detected anomaly..."):
                                        notification_sent = send_notification(
                                            st.session_state.notification_type,
                                            st.session_state.notification_contact,
                                            anomaly_message,
                                            result.get("frame")
                                        )

                                        if notification_sent:
                                            st.success(f"Notification sent to {st.session_state.notification_contact} via {st.session_state.notification_type.capitalize()}")
                                        else:
                                            st.error(f"Failed to send notification. Please check your {st.session_state.notification_type} settings.")
                            else:
                                # For cumulative analysis, we get periodic updates
                                st.markdown(f"### Cumulative Analysis (Updated)")
                                display_single_result(result)

                                # Send notification if anomaly detected and notifications are enabled
                                if result.get("anomaly_detected", False) and st.session_state.notification_type and st.session_state.notification_contact:
                                    # Create notification message
                                    anomaly_type = result.get("anomaly_type", "Unknown")
                                    anomaly_message = f"Anomaly detected in live stream (Cumulative Analysis).\n"
                                    anomaly_message += f"Anomaly type: {anomaly_type}\n\n"

                                    # Add analysis details
                                    analysis_text = None
                                    for key in ["analysis", "text", "description"]:
                                        if key in result and result[key]:
                                            analysis_text = result[key]
                                            break

                                    if analysis_text:
                                        anomaly_message += f"Analysis: {analysis_text[:500]}..."

                                    # Get a frame for the notification if available
                                    anomaly_image = None
                                    if "frames" in result and result["frames"]:
                                        anomaly_image = result["frames"][0]

                                    # Send notification
                                    with st.spinner("Sending notification about detected anomaly..."):
                                        notification_sent = send_notification(
                                            st.session_state.notification_type,
                                            st.session_state.notification_contact,
                                            anomaly_message,
                                            anomaly_image
                                        )

                                        if notification_sent:
                                            st.success(f"Notification sent to {st.session_state.notification_contact} via {st.session_state.notification_type.capitalize()}")
                                        else:
                                            st.error(f"Failed to send notification. Please check your {st.session_state.notification_type} settings.")

                        # Sleep briefly to allow UI updates
                        time.sleep(0.1)
                except StopIteration:
                    if not st.session_state.stop_requested:
                        st.info("Stream ended")

                # Final results
                if analysis_depth_value == "granular":
                    results = all_results
                else:
                    results = all_results[-1] if all_results else None

        end_time = time.time()

        # Calculate processing time
        processing_time = end_time - start_time
        st.success(f"Processing completed in {processing_time:.2f} seconds")

        # Check if notifications are enabled and if anomalies were detected
        if st.session_state.notification_type and st.session_state.notification_contact:
            # Check if anomalies were detected
            anomalies_detected = False
            anomaly_image = None
            anomaly_message = ""

            if analysis_depth_value == "granular":
                # For granular analysis, check if any frame has an anomaly
                anomaly_frames = [r for r in results if r.get("anomaly_detected", False)]
                if anomaly_frames:
                    anomalies_detected = True
                    # Get the first anomaly frame for the notification
                    first_anomaly = anomaly_frames[0]
                    anomaly_image = first_anomaly.get("frame")

                    # Create notification message
                    anomaly_types = set(r.get("anomaly_type", "Unknown") for r in anomaly_frames)
                    anomaly_message = f"Anomaly detected in {len(anomaly_frames)} out of {len(results)} frames.\n"
                    anomaly_message += f"Anomaly types: {', '.join(anomaly_types)}\n\n"

                    # Add details of the first anomaly
                    analysis_text = None
                    for key in ["analysis", "text", "description"]:
                        if key in first_anomaly and first_anomaly[key]:
                            analysis_text = first_anomaly[key]
                            break

                    if analysis_text:
                        anomaly_message += f"Analysis of first anomaly: {analysis_text[:500]}..."
            else:
                # For cumulative analysis, check the overall result
                if results.get("anomaly_detected", False):
                    anomalies_detected = True

                    # Get a frame for the notification if available
                    if "frames" in results and results["frames"]:
                        anomaly_image = results["frames"][0]

                    # Create notification message
                    anomaly_type = results.get("anomaly_type", "Unknown")
                    anomaly_message = f"Anomaly detected in video analysis.\n"
                    anomaly_message += f"Anomaly type: {anomaly_type}\n\n"

                    # Add analysis details
                    analysis_text = None
                    for key in ["analysis", "text", "description"]:
                        if key in results and results[key]:
                            analysis_text = results[key]
                            break

                    if analysis_text:
                        anomaly_message += f"Analysis: {analysis_text[:500]}..."

            # Send notification if anomalies were detected
            if anomalies_detected:
                with st.spinner("Sending notification about detected anomalies..."):
                    notification_sent = send_notification(
                        st.session_state.notification_type,
                        st.session_state.notification_contact,
                        anomaly_message,
                        anomaly_image
                    )

                    if notification_sent:
                        st.success(f"Notification sent to {st.session_state.notification_contact} via {st.session_state.notification_type.capitalize()}")
                    else:
                        st.error(f"Failed to send notification. Please check your {st.session_state.notification_type} settings.")

        # Only display results here if we're not in time interval mode
        # (time interval mode displays results as they come in)
        if not (input_source == "Live Stream" and capture_mode == "Time Interval (Continuous)"):
            # Display the results without an additional header
            display_results(results, analysis_depth_value)

        # Download results button
        if results:
            try:
                # Convert results to JSON using our custom encoder
                results_json = json.dumps(results, indent=2, cls=NumpyEncoder)

                # Create a download button
                st.download_button(
                    label="Download Results as JSON",
                    data=results_json,
                    file_name="anomaly_detection_results.json",
                    mime="application/json"
                )
            except Exception as e:
                st.warning(f"Could not create downloadable results: {str(e)}")
                st.info("This is usually due to large image data in the results. The analysis is still valid.")

        # Clean up the temporary file if using a video file
        if input_source == "Video File" and 'video_path' in locals():
            # Only delete the file if it's a temporary file, not a sample file
            if not isinstance(uploaded_file, str):
                os.unlink(video_path)

    except Exception as e:
        st.error(f"⚠️ An error occurred: {str(e)}")
        if input_source == "Video File" and 'video_path' in locals():
            # Only delete the file if it's a temporary file, not a sample file
            if not isinstance(uploaded_file, str):
                os.unlink(video_path)

# Instructions when no file is uploaded or stream is selected
if (input_source == "Video File" and uploaded_file is None) or (input_source == "Live Stream" and stream_source is None) or not submit_button:
    # Using HTML component to properly render the HTML
    model_options_html = ""
    if PHI4_AVAILABLE:
        model_options_html += "<li><strong>Phi-4</strong> - Microsoft's multimodal model, runs locally</li>"

    instructions_html = f"""
    <div class="result-container instructions-container">
        <h2 style="color: #5046E5;">📝 How to use this application</h2>

        <ol>
            <li><strong>Select an input source</strong>:
                <ul>
                    <li><strong>Video File</strong> - Upload a video file for analysis</li>
                    <li><strong>Live Stream</strong> - Connect to a webcam or IP camera stream</li>
                </ul>
            </li>
            <li><strong>Select an AI model</strong> for analysis:
                <ul>
                    <li><strong>GPT-4o-mini</strong> - Faster and more cost-effective</li>
                    <li><strong>GPT-4o</strong> - Most powerful model with highest accuracy</li>
                    {model_options_html}
                </ul>
            </li>
            <li><strong>Set the number of frames to skip</strong> - higher values process fewer frames</li>
            <li><strong>Choose an analysis depth</strong>:
                <ul>
                    <li><strong>Granular</strong> - Analyzes each frame individually</li>
                    <li><strong>Cumulative</strong> - Provides an overall summary with key frames</li>
                </ul>
            </li>
            <li><strong>Enter a prompt</strong> describing what anomaly to look for</li>
            <li><strong>Enter your OpenAI API key</strong> with access to the selected model (not needed for Phi-4)</li>
            <li><strong>Click "Analyze Video"</strong> to start processing</li>
        </ol>

        <p>The application will extract frames from your video or stream, analyze them using the selected AI model, and display the results with clear indicators for detected anomalies.</p>
    </div>
    """
    components.html(instructions_html, height=500)

# Footer
st.markdown("---")
st.markdown("<div class='footer'>Powered by OpenAI's GPT-4o, GPT-4o-mini, and Microsoft's Phi-4 models | © 2023 Video Anomaly Detector</div>", unsafe_allow_html=True)
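The Time Interval branch above consumes process_live_stream (defined in detector.py below) as a generator, pulling one analyzed frame at a time. As a point of reference, here is a minimal sketch of that consumption pattern outside Streamlit; it assumes a VideoAnomalyDetector instance named detector, a webcam at index 0, and caps the loop at five frames purely for illustration:

# Minimal sketch: iterate over time-interval results from the detector.
# Assumes `detector` is a VideoAnomalyDetector (see detector.py below)
# and a webcam is available at index 0.
gen = detector.process_live_stream(
    stream_source=0,
    skip_frames=0,
    prompt="Report anything unusual in the scene.",
    analysis_depth="granular",
    max_frames=None,
    callback=None,
    time_interval=5,   # capture and analyze one frame every 5 seconds
)

for i, result in enumerate(gen, start=1):
    print(f"Frame {i}: anomaly={result.get('anomaly_detected')}, type={result.get('anomaly_type')}")
    if i >= 5:         # stop after a few frames in this sketch
        break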
detector.py
ADDED
@@ -0,0 +1,465 @@
import cv2
import numpy as np
import base64
import os
import tempfile
from openai import OpenAI
from PIL import Image
import io
import re
import time

class VideoAnomalyDetector:
    def __init__(self, api_key, model="gpt-4o"):
        """
        Initialize the VideoAnomalyDetector with OpenAI API key.

        Args:
            api_key (str): OpenAI API key for accessing GPT-4o model
            model (str): Model to use for analysis ("gpt-4o" or "gpt-4o-mini")
        """
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def extract_frames(self, video_path, skip_frames):
        """
        Extract frames from a video file, skipping the specified number of frames.

        Args:
            video_path (str): Path to the video file
            skip_frames (int): Number of frames to skip between captures

        Returns:
            list: List of extracted frames as numpy arrays
        """
        frames = []
        # Use the default backend for video files
        # DirectShow can cause issues with some video files
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        # Don't set MJPG format for video files as it can interfere with proper decoding
        # cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M','J','P','G'))

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % (skip_frames + 1) == 0:
                # Convert from BGR to RGB
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(rgb_frame)

            frame_count += 1

        cap.release()
        return frames

    def process_live_stream(self, stream_source, skip_frames, prompt, analysis_depth="granular", max_frames=None, callback=None, time_interval=None):
        """
        Process frames from a live video stream.

        Args:
            stream_source: Stream source (0 for webcam, URL for IP camera or RTSP stream)
            skip_frames (int): Number of frames to skip between captures
            prompt (str): Prompt describing what anomaly to look for
            analysis_depth (str): "granular" for frame-by-frame analysis or "cumulative" for overall analysis
            max_frames (int, optional): Maximum number of frames to process (None for unlimited)
            callback (function, optional): Callback function to report progress
            time_interval (int, optional): If set, capture one frame every X seconds instead of using skip_frames

        Returns:
            list or dict: List of analysis results for each processed frame (granular) or dict with cumulative analysis (cumulative)
        """
        # Open the video stream with appropriate backend
        # Only use DirectShow for local webcams (0 or 1) on Windows
        if os.name == 'nt' and (stream_source == 0 or stream_source == 1):
            # This is a local webcam on Windows, use DirectShow
            cap = cv2.VideoCapture(stream_source, cv2.CAP_DSHOW)

            # For webcams, MJPG format can be more stable
            cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M','J','P','G'))
        else:
            # For IP cameras, RTSP streams, or non-Windows systems, use default backend
            cap = cv2.VideoCapture(stream_source)

        if not cap.isOpened():
            raise ValueError(f"Could not open video stream: {stream_source}")

        frames = []
        frame_count = 0
        processed_count = 0
        last_capture_time = time.time()

        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                current_time = time.time()

                # Determine if we should capture this frame
                should_capture = False

                if time_interval is not None:
                    # Time-based interval mode
                    if current_time - last_capture_time >= time_interval:
                        should_capture = True
                        last_capture_time = current_time
                else:
                    # Frame-skip mode
                    if frame_count % (skip_frames + 1) == 0:
                        should_capture = True

                if should_capture:
                    # Convert from BGR to RGB
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(rgb_frame)
                    processed_count += 1

                    # Process the frame immediately in time interval mode
                    if time_interval is not None:
                        # Process the frame immediately
                        result = self.analyze_frame(rgb_frame, prompt)

                        # Make sure the frame is included in the result
                        if "frame" not in result:
                            result["frame"] = rgb_frame

                        # If we have a callback, call it with -1 for total to indicate continuous mode
                        if callback:
                            callback(processed_count, -1)

                        # In time interval mode, we yield results one by one
                        if analysis_depth == "granular":
                            yield result
                        else:
                            # For cumulative analysis, we need to keep all frames
                            # We'll handle this outside the loop
                            pass
                    else:
                        # Update progress if callback is provided (frame count mode)
                        if callback and max_frames:
                            callback(processed_count, max_frames)

                        # Break if we've reached the maximum number of frames
                        if max_frames and processed_count >= max_frames:
                            break

                frame_count += 1
        finally:
            cap.release()

        # If we're in time interval mode with cumulative analysis, we don't return here
        # as we're yielding results above
        if time_interval is not None and analysis_depth == "cumulative":
            # This is a special case - we need to periodically do cumulative analysis
            # For simplicity, we'll just return the current cumulative analysis
            result = self.analyze_frames_cumulatively(frames, prompt, callback)
            yield result
            return

        # Process the collected frames for non-time interval mode
        if time_interval is None:
            if analysis_depth == "cumulative":
                return self.analyze_frames_cumulatively(frames, prompt, callback)
            else:  # granular (default)
                results = []

                for i, frame in enumerate(frames):
                    if callback:
                        callback(i, len(frames))

                    result = self.analyze_frame(frame, prompt)
                    results.append(result)

                return results

    def encode_image_to_base64(self, image_array):
        """
        Convert a numpy array image to base64 encoded string.

        Args:
            image_array (numpy.ndarray): Image as numpy array

        Returns:
            str: Base64 encoded image string
        """
        # Convert numpy array to PIL Image
        pil_image = Image.fromarray(image_array)

        # Create a bytes buffer
        buffer = io.BytesIO()

        # Save the image to the buffer in PNG format
        pil_image.save(buffer, format="PNG")

        # Get the bytes from the buffer
        img_bytes = buffer.getvalue()

        # Encode the bytes to base64
        base64_encoded = base64.b64encode(img_bytes).decode('utf-8')

        return base64_encoded

    def analyze_frame(self, frame, prompt):
        """
        Analyze a frame using the selected OpenAI model.

        Args:
            frame (numpy.ndarray): Frame to analyze
            prompt (str): Prompt describing what anomaly to look for

        Returns:
            dict: Analysis result from the model
        """
        base64_image = self.encode_image_to_base64(frame)

        # Enhanced prompt to get structured information about anomalies
        enhanced_prompt = f"""
        {prompt}

        After your analysis, please include a structured assessment at the end of your response in this exact format:
        ANOMALY_DETECTED: [Yes/No]
        ANOMALY_TYPE: [Human/Non-human/None]
        CONFIDENCE: [0-100]

        For ANOMALY_DETECTED, answer "Yes" if you detect any anomaly, otherwise "No".
        For ANOMALY_TYPE, if an anomaly is detected, classify it as either "Human" (if it involves people or human activities) or "Non-human" (if it involves objects, animals, or environmental factors). If no anomaly is detected, use "None".
        For CONFIDENCE, provide a number from 0 to 100 indicating your confidence level in the assessment.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": enhanced_prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}"
                                }
                            }
                        ]
                    }
                ],
                max_tokens=1000
            )

            # Extract the response text
            response_text = response.choices[0].message.content

            # Extract anomaly detection information using regex
            anomaly_detected = False
            anomaly_type = "None"
            confidence = 0

            # Look for the structured format
            anomaly_match = re.search(r'ANOMALY_DETECTED:\s*(Yes|No)', response_text, re.IGNORECASE)
            if anomaly_match and anomaly_match.group(1).lower() == 'yes':
                anomaly_detected = True
                confidence = 90  # Default high confidence when anomaly is detected

            # If anomaly detected, look for the type
            type_match = re.search(r'ANOMALY_TYPE:\s*(Human|Non-human|None)', response_text, re.IGNORECASE)
            if type_match:
                anomaly_type = type_match.group(1)

            # Look for confidence information
            conf_match = re.search(r'CONFIDENCE:\s*(\d+)', response_text, re.IGNORECASE)
            if conf_match:
                try:
                    confidence = int(conf_match.group(1))
                except:
                    pass  # Keep default confidence if parsing fails

            return {
                "text": response_text,
                "analysis": response_text,  # Add analysis field as an alias for text
                "frame": frame,
                "anomaly_detected": anomaly_detected,
                "anomaly_type": anomaly_type,
                "confidence": confidence,  # Add confidence field
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }
        except Exception as e:
            return {
                "error": str(e),
                "frame": frame,
                "anomaly_detected": False,
                "anomaly_type": "None",
                "confidence": 0,  # Add confidence field
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }

    def analyze_frames_cumulatively(self, frames, prompt, callback=None):
        """
        Analyze all frames together and provide a cumulative analysis.

        Args:
            frames (list): List of frames to analyze
            prompt (str): Prompt describing what anomaly to look for
            callback (function, optional): Callback function to report progress

        Returns:
            dict: Cumulative analysis result
        """
        # First, analyze each frame individually to identify potential anomalies
        individual_results = []
        for i, frame in enumerate(frames):
            if callback:
                callback(i, len(frames) * 2)  # First half of progress for individual analysis

            result = self.analyze_frame(frame, f"{prompt} Provide a brief analysis of this frame only.")
            individual_results.append(result)

        # Identify frames with potential anomalies
        anomaly_frames = []
        anomaly_descriptions = []
        anomaly_types = []

        for i, result in enumerate(individual_results):
            if "error" not in result and result["anomaly_detected"]:
                anomaly_frames.append(result["frame"])
                anomaly_descriptions.append(f"Frame {i+1}: {result['text']}")
                anomaly_types.append(result["anomaly_type"])

                # Limit to 3 anomaly frames
                if len(anomaly_frames) >= 3:
                    break

        # If no anomalies were detected, use the first, middle, and last frames
        if not anomaly_frames and len(frames) > 0:
            if len(frames) == 1:
                anomaly_frames = [frames[0]]
            elif len(frames) == 2:
                anomaly_frames = [frames[0], frames[1]]
            else:
                anomaly_frames = [
                    frames[0],
                    frames[len(frames) // 2],
                    frames[-1]
                ]

        # Limit to max 3 frames
        anomaly_frames = anomaly_frames[:3]

        # Create a cumulative analysis prompt with the anomaly descriptions
        cumulative_prompt = f"""
        {prompt}

        Based on the analysis of all frames, provide a comprehensive summary of any anomalies detected in the video. Focus on patterns or recurring issues. Here are some notable observations from individual frames:

        {chr(10).join(anomaly_descriptions[:5])}

        After your analysis, please include a structured assessment at the end of your response in this exact format:
        ANOMALY_DETECTED: [Yes/No]
        ANOMALY_TYPE: [Human/Non-human/None]

        For ANOMALY_DETECTED, answer "Yes" if you detect any anomaly across the video, otherwise "No".
        For ANOMALY_TYPE, if an anomaly is detected, classify the predominant type as either "Human" (if it involves people or human activities) or "Non-human" (if it involves objects, animals, or environmental factors). If no anomaly is detected, use "None".
        """

        try:
            if callback:
                callback(len(frames), len(frames) * 2)  # Second half of progress for cumulative analysis

            # Encode the selected frames
            base64_images = [self.encode_image_to_base64(frame) for frame in anomaly_frames]

            # Create the content for the API call
            content = [{"type": "text", "text": cumulative_prompt}]

            # Add the images
            for base64_image in base64_images:
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                })

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "user",
                        "content": content
                    }
                ],
                max_tokens=1500
            )

            # Extract the response text
            response_text = response.choices[0].message.content

            # Extract anomaly detection information using regex
            anomaly_detected = False
            anomaly_type = "None"

            # Look for the structured format
            anomaly_match = re.search(r'ANOMALY_DETECTED:\s*(Yes|No)', response_text, re.IGNORECASE)
            if anomaly_match and anomaly_match.group(1).lower() == 'yes':
                anomaly_detected = True

            # If anomaly detected, look for the type
            type_match = re.search(r'ANOMALY_TYPE:\s*(Human|Non-human|None)', response_text, re.IGNORECASE)
            if type_match:
                anomaly_type = type_match.group(1)

            return {
                "text": response_text,
                "frames": anomaly_frames,
                "anomaly_detected": anomaly_detected,
                "anomaly_type": anomaly_type,
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }
        except Exception as e:
            return {
                "error": str(e),
                "frames": anomaly_frames,
                "anomaly_detected": False,
                "anomaly_type": "None",
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }

    def process_video(self, video_path, skip_frames, prompt, analysis_depth="granular", callback=None):
        """
        Process a video file, extracting frames and analyzing them for anomalies.

        Args:
            video_path (str): Path to the video file
            skip_frames (int): Number of frames to skip between captures
            prompt (str): Prompt describing what anomaly to look for
            analysis_depth (str): "granular" for frame-by-frame analysis or "cumulative" for overall analysis
            callback (function, optional): Callback function to report progress

        Returns:
            list or dict: List of analysis results for each processed frame (granular) or dict with cumulative analysis (cumulative)
        """
        frames = self.extract_frames(video_path, skip_frames)

        if analysis_depth == "cumulative":
            return self.analyze_frames_cumulatively(frames, prompt, callback)
        else:  # granular (default)
            results = []

            for i, frame in enumerate(frames):
                if callback:
                    if analysis_depth == "cumulative":
                        callback(i, len(frames) / 2)
                    else:
                        callback(i, len(frames))

                result = self.analyze_frame(frame, prompt)
                results.append(result)

            return results
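A minimal usage sketch for the class above, assuming a valid API key is exported as OPENAI_API_KEY and that the sample vid-1.mp4 from this repository sits in the working directory:

import os
from detector import VideoAnomalyDetector

# Minimal sketch: analyze roughly one frame out of every 30 with GPT-4o-mini.
detector = VideoAnomalyDetector(api_key=os.environ["OPENAI_API_KEY"], model="gpt-4o-mini")

results = detector.process_video(
    video_path="vid-1.mp4",
    skip_frames=29,
    prompt="Is there anything unusual happening in this scene?",
    analysis_depth="granular",
)

for i, r in enumerate(results, start=1):
    if r.get("anomaly_detected"):
        print(f"Frame {i}: {r['anomaly_type']} anomaly (confidence {r['confidence']})")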
phi4_detector.py
ADDED
@@ -0,0 +1,500 @@
"""
Phi-4 model implementation for video anomaly detection using Hugging Face transformers.
"""

import cv2
import numpy as np
import base64
import os
import tempfile
from PIL import Image
import io
import re
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor
from huggingface_hub import snapshot_download

class Phi4AnomalyDetector:
    def __init__(self, model_name="microsoft/Phi-4-multimodal-instruct"):
        """
        Initialize the Phi4AnomalyDetector with the Phi-4 vision model.

        Args:
            model_name (str): Name of the Phi-4 vision model on Hugging Face
        """
        self.model_name = model_name
        self.model_dir = os.path.join(os.getcwd(), "phi4_model")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load or download the model
        self.load_model()

    def load_model(self):
        """
        Load the Phi-4 model from local directory or download from Hugging Face.
        """
        try:
            if not os.path.exists(self.model_dir):
                print(f"Downloading {self.model_name} model to {self.model_dir}...")
                snapshot_download(repo_id=self.model_name, local_dir=self.model_dir)
                print("Model downloaded successfully.")
            else:
                print(f"Using existing model from {self.model_dir}")

            # Load model components with trust_remote_code=True
            self.processor = AutoProcessor.from_pretrained(
                self.model_dir,
                trust_remote_code=True
            )

            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_dir,
                trust_remote_code=True
            )

            # Load model with appropriate dtype based on device
            if self.device == "cuda":
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_dir,
                    torch_dtype=torch.float16,
                    device_map="auto",
                    trust_remote_code=True
                )
            else:
                self.model = AutoModelForCausalLM.from_pretrained(
                    self.model_dir,
                    device_map="auto",
                    trust_remote_code=True
                )

            print(f"Phi-4 model loaded successfully on {self.device}")

        except Exception as e:
            raise RuntimeError(f"Failed to load Phi-4 model: {str(e)}")

    def extract_frames(self, video_path, skip_frames):
        """
        Extract frames from a video file, skipping the specified number of frames.

        Args:
            video_path (str): Path to the video file
            skip_frames (int): Number of frames to skip between captures

        Returns:
            list: List of extracted frames as numpy arrays
        """
        frames = []
        # Use the default backend for video files
        # DirectShow can cause issues with some video files
        cap = cv2.VideoCapture(video_path)

        # Don't set MJPG format for video files as it can interfere with proper decoding
        # cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M','J','P','G'))

        if not cap.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % (skip_frames + 1) == 0:
                # Convert from BGR to RGB
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(rgb_frame)

            frame_count += 1

        cap.release()
        return frames

    def process_live_stream(self, stream_source, skip_frames, prompt, analysis_depth="granular", max_frames=None, callback=None, time_interval=None):
        """
        Process frames from a live video stream.

        Args:
            stream_source: Stream source (0 for webcam, URL for IP camera or RTSP stream)
            skip_frames (int): Number of frames to skip between captures
            prompt (str): Prompt describing what anomaly to look for
            analysis_depth (str): "granular" for frame-by-frame analysis or "cumulative" for overall analysis
            max_frames (int, optional): Maximum number of frames to process (None for unlimited)
            callback (function, optional): Callback function to report progress
            time_interval (int, optional): If set, capture one frame every X seconds instead of using skip_frames

        Returns:
            list or dict: List of analysis results for each processed frame (granular) or dict with cumulative analysis (cumulative)
        """
        # Open the video stream with appropriate backend
        # Only use DirectShow for local webcams (0 or 1) on Windows
        if os.name == 'nt' and (stream_source == 0 or stream_source == 1):
            # This is a local webcam on Windows, use DirectShow
            cap = cv2.VideoCapture(stream_source, cv2.CAP_DSHOW)

            # For webcams, MJPG format can be more stable
            cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M','J','P','G'))
        else:
            # For IP cameras, RTSP streams, or non-Windows systems, use default backend
            cap = cv2.VideoCapture(stream_source)

        if not cap.isOpened():
            raise ValueError(f"Could not open video stream: {stream_source}")

        frames = []
        frame_count = 0
        processed_count = 0
        last_capture_time = time.time()

        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                current_time = time.time()

                # Determine if we should capture this frame
                should_capture = False

                if time_interval is not None:
                    # Time-based interval mode
                    if current_time - last_capture_time >= time_interval:
                        should_capture = True
                        last_capture_time = current_time
                else:
                    # Frame-skip mode
                    if frame_count % (skip_frames + 1) == 0:
                        should_capture = True

                if should_capture:
                    # Convert from BGR to RGB
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frames.append(rgb_frame)
                    processed_count += 1

                    # Process the frame immediately in time interval mode
                    if time_interval is not None:
                        # Process the frame immediately
                        result = self.analyze_frame(rgb_frame, prompt)

                        # Make sure the frame is included in the result
                        if "frame" not in result:
                            result["frame"] = rgb_frame

                        # If we have a callback, call it with -1 for total to indicate continuous mode
                        if callback:
                            callback(processed_count, -1)

                        # In time interval mode, we yield results one by one
                        if analysis_depth == "granular":
                            yield result
                        else:
                            # For cumulative analysis, we need to keep all frames
                            # We'll handle this outside the loop
                            pass
                    else:
                        # Update progress if callback is provided (frame count mode)
                        if callback and max_frames:
                            callback(processed_count, max_frames)

                        # Break if we've reached the maximum number of frames
                        if max_frames and processed_count >= max_frames:
                            break

                frame_count += 1
        finally:
            cap.release()

        # If we're in time interval mode with cumulative analysis, we don't return here
        # as we're yielding results above
        if time_interval is not None and analysis_depth == "cumulative":
            # This is a special case - we need to periodically do cumulative analysis
            # For simplicity, we'll just return the current cumulative analysis
            result = self.analyze_frames_cumulatively(frames, prompt, callback)
            yield result
            return

        # Process the collected frames for non-time interval mode
        if time_interval is None:
            if analysis_depth == "cumulative":
                return self.analyze_frames_cumulatively(frames, prompt, callback)
            else:  # granular (default)
                results = []

                for i, frame in enumerate(frames):
                    if callback:
                        callback(i, len(frames))

                    result = self.analyze_frame(frame, prompt)
                    results.append(result)

                return results

    def analyze_frame(self, frame, prompt):
        """
        Analyze a frame using the Phi-4 vision model.

        Args:
            frame (numpy.ndarray): Frame to analyze
            prompt (str): Prompt describing what anomaly to look for

        Returns:
            dict: Analysis result from the model
        """
        # Convert numpy array to PIL Image
        pil_image = Image.fromarray(frame)

        # Enhanced prompt to get structured information about anomalies
        enhanced_prompt = f"""
        {prompt}

        After your analysis, please include a structured assessment at the end of your response in this exact format:
        ANOMALY_DETECTED: [Yes/No]
        ANOMALY_TYPE: [Human/Non-human/None]
        CONFIDENCE: [0-100]

        For ANOMALY_DETECTED, answer "Yes" if you detect any anomaly, otherwise "No".
        For ANOMALY_TYPE, if an anomaly is detected, classify it as either "Human" (if it involves people or human activities) or "Non-human" (if it involves objects, animals, or environmental factors). If no anomaly is detected, use "None".
        For CONFIDENCE, provide a number from 0 to 100 indicating your confidence level in the assessment.
        """

        try:
            # Process the image and prompt with the Phi-4 model
            inputs = self.processor(text=enhanced_prompt, images=pil_image, return_tensors="pt")

            # Move inputs to the same device as the model
            for key in inputs:
                if torch.is_tensor(inputs[key]):
                    inputs[key] = inputs[key].to(self.model.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=500,
                    do_sample=False
                )

            # Decode the response
            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract the part after the prompt
            if enhanced_prompt in response_text:
                response_text = response_text.split(enhanced_prompt)[-1].strip()

            # Extract anomaly detection information using regex
            anomaly_detected = False
            anomaly_type = "None"
            confidence = 0

            # Look for the structured format
            anomaly_match = re.search(r'ANOMALY_DETECTED:\s*(Yes|No)', response_text, re.IGNORECASE)
            if anomaly_match and anomaly_match.group(1).lower() == 'yes':
                anomaly_detected = True
                confidence = 90  # Default high confidence when anomaly is detected

            # If anomaly detected, look for the type
            type_match = re.search(r'ANOMALY_TYPE:\s*(Human|Non-human|None)', response_text, re.IGNORECASE)
            if type_match:
                anomaly_type = type_match.group(1)

            # Look for confidence information
            conf_match = re.search(r'CONFIDENCE:\s*(\d+)', response_text, re.IGNORECASE)
            if conf_match:
                try:
                    confidence = int(conf_match.group(1))
                except:
                    pass  # Keep default confidence if parsing fails

            return {
                "text": response_text,
                "analysis": response_text,  # Add analysis field as an alias for text
                "frame": frame,
                "anomaly_detected": anomaly_detected,
                "anomaly_type": anomaly_type,
                "confidence": confidence,  # Add confidence field
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }
        except Exception as e:
            return {
                "error": str(e),
                "frame": frame,
                "anomaly_detected": False,
                "anomaly_type": "None",
                "confidence": 0,  # Add default confidence for error
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }

    def analyze_frames_cumulatively(self, frames, prompt, callback=None):
        """
        Analyze all frames together and provide a cumulative analysis.

        Args:
            frames (list): List of frames to analyze
            prompt (str): Prompt describing what anomaly to look for
            callback (function, optional): Callback function to report progress

        Returns:
            dict: Cumulative analysis result
        """
        # First, analyze each frame individually to identify potential anomalies
        individual_results = []
        for i, frame in enumerate(frames):
            if callback:
                callback(i, len(frames) * 2)  # First half of progress for individual analysis

            result = self.analyze_frame(frame, f"{prompt} Provide a brief analysis of this frame only.")
            individual_results.append(result)

        # Identify frames with potential anomalies
        anomaly_frames = []
        anomaly_descriptions = []
        anomaly_types = []

        for i, result in enumerate(individual_results):
            if "error" not in result and result["anomaly_detected"]:
                anomaly_frames.append(result["frame"])
                anomaly_descriptions.append(f"Frame {i+1}: {result['text']}")
                anomaly_types.append(result["anomaly_type"])

                # Limit to 3 anomaly frames
                if len(anomaly_frames) >= 3:
                    break

        # If no anomalies were detected, use the first, middle, and last frames
        if not anomaly_frames and len(frames) > 0:
            if len(frames) == 1:
                anomaly_frames = [frames[0]]
            elif len(frames) == 2:
                anomaly_frames = [frames[0], frames[1]]
            else:
                anomaly_frames = [
                    frames[0],
                    frames[len(frames) // 2],
                    frames[-1]
                ]

        # Limit to max 3 frames
        anomaly_frames = anomaly_frames[:3]

        # Create a cumulative analysis prompt with the anomaly descriptions
        cumulative_prompt = f"""
        {prompt}

        Based on the analysis of all frames, provide a comprehensive summary of any anomalies detected in the video. Focus on patterns or recurring issues. Here are some notable observations from individual frames:

        {chr(10).join(anomaly_descriptions[:5])}

        After your analysis, please include a structured assessment at the end of your response in this exact format:
        ANOMALY_DETECTED: [Yes/No]
        ANOMALY_TYPE: [Human/Non-human/None]

        For ANOMALY_DETECTED, answer "Yes" if you detect any anomaly across the video, otherwise "No".
        For ANOMALY_TYPE, if an anomaly is detected, classify the predominant type as either "Human" (if it involves people or human activities) or "Non-human" (if it involves objects, animals, or environmental factors). If no anomaly is detected, use "None".
        """

        try:
            if callback:
                callback(len(frames), len(frames) * 2)  # Second half of progress for cumulative analysis

            # For cumulative analysis, we'll use the first anomaly frame (or first frame if no anomalies)
            representative_frame = anomaly_frames[0] if anomaly_frames else frames[0]
            pil_image = Image.fromarray(representative_frame)

            # Process with Phi-4
            inputs = self.processor(text=cumulative_prompt, images=pil_image, return_tensors="pt")

            # Move inputs to the same device as the model
            for key in inputs:
                if torch.is_tensor(inputs[key]):
                    inputs[key] = inputs[key].to(self.model.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1000,
                    do_sample=False
                )

            # Decode the response
            response_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Extract the part after the prompt
            if cumulative_prompt in response_text:
                response_text = response_text.split(cumulative_prompt)[-1].strip()

            # Extract anomaly detection information using regex
            anomaly_detected = False
            anomaly_type = "None"
            confidence = 0

            # Look for the structured format
            anomaly_match = re.search(r'ANOMALY_DETECTED:\s*(Yes|No)', response_text, re.IGNORECASE)
            if anomaly_match and anomaly_match.group(1).lower() == 'yes':
                anomaly_detected = True
                confidence = 90  # Default high confidence when anomaly is detected

            # If anomaly detected, look for the type
            type_match = re.search(r'ANOMALY_TYPE:\s*(Human|Non-human|None)', response_text, re.IGNORECASE)
            if type_match:
                anomaly_type = type_match.group(1)

            # Look for confidence information
            conf_match = re.search(r'CONFIDENCE:\s*(\d+)', response_text, re.IGNORECASE)
            if conf_match:
                try:
                    confidence = int(conf_match.group(1))
                except:
                    pass  # Keep default confidence if parsing fails

            return {
                "text": response_text,
                "analysis": response_text,  # Add analysis field as an alias for text
                "frames": anomaly_frames,
                "anomaly_detected": anomaly_detected,
                "anomaly_type": anomaly_type,
                "confidence": confidence,  # Add confidence field
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }
        except Exception as e:
            return {
                "error": str(e),
                "frames": anomaly_frames,
                "anomaly_detected": False,
                "anomaly_type": "None",
                "confidence": 0,  # Add default confidence for error
                "timestamp": time.time()  # Add timestamp for live stream analysis
            }

    def process_video(self, video_path, skip_frames, prompt, analysis_depth="granular", callback=None):
        """
        Process a video file, extracting frames and analyzing them for anomalies.

        Args:
            video_path (str): Path to the video file
            skip_frames (int): Number of frames to skip between captures
            prompt (str): Prompt describing what anomaly to look for
            analysis_depth (str): "granular" for frame-by-frame analysis or "cumulative" for overall analysis
            callback (function, optional): Callback function to report progress

        Returns:
            list or dict: List of analysis results for each processed frame (granular) or dict with cumulative analysis (cumulative)
        """
        frames = self.extract_frames(video_path, skip_frames)

        if analysis_depth == "cumulative":
            return self.analyze_frames_cumulatively(frames, prompt, callback)
        else:  # granular (default)
            results = []

            for i, frame in enumerate(frames):
                if callback:
                    callback(i, len(frames))

                result = self.analyze_frame(frame, prompt)
                results.append(result)

            return results
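The same workflow can be run fully locally with the class above. A minimal sketch, assuming the sample vid-2.mp4 from this repository and enough disk and GPU memory for the Phi-4 weights (the first instantiation downloads microsoft/Phi-4-multimodal-instruct into ./phi4_model):

from phi4_detector import Phi4AnomalyDetector

# Minimal sketch: local cumulative analysis with Phi-4 (no OpenAI API key needed).
detector = Phi4AnomalyDetector()

summary = detector.process_video(
    video_path="vid-2.mp4",
    skip_frames=29,
    prompt="Is there anything unusual happening in this scene?",
    analysis_depth="cumulative",   # one overall summary with key frames
)

print(summary["text"])
print("Anomaly detected:", summary["anomaly_detected"])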
requirements.txt
ADDED
@@ -0,0 +1,12 @@
opencv-python==4.8.1.78
numpy==1.26.2
streamlit==1.31.0
python-dotenv==1.0.0
openai==1.3.7
Pillow==10.1.0
requests==2.31.0
colorama==0.4.6
torch>=2.0.0
transformers>=4.35.0
huggingface-hub>=0.19.0
accelerate>=0.25.0
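These pinned dependencies can be installed with pip install -r requirements.txt. The torch, transformers, huggingface-hub, and accelerate entries are only needed for the local Phi-4 path; the GPT-4o and GPT-4o-mini detectors rely on the openai package plus the OpenCV and Pillow imaging libraries.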
vid-1.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b7f36ae934376c55a7ce02f06a48db1d1d1acd8e7a2ac254c131051aaacb7339
size 2725255
vid-2.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1e7f944017e7dc1d9c0f5b38338996e80f26d55535fffbfd232c6e16d2386d8a
size 880606