Commit e4e56ea
Parent(s): d214c78

Init app

Files changed:
- .streamlit/config.toml +11 -0
- README.md +3 -3
- app.py +33 -0
- requirements.txt +30 -0
- transcriber.py +68 -0
- ui_create_summary.py +0 -0
- ui_home.py +8 -0
- ui_result.py +0 -0
- ui_summarize.py +338 -0
- ui_transcribe.py +284 -0
- ui_upload.py +320 -0
- ui_video.py +216 -0
- utils.py +436 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,11 @@
[client]
showErrorDetails = true

[server]
headless = false
enableCORS = false
enableXsrfProtection = false
maxUploadSize = 5000

[theme]
primaryColor = "#FA8E00"
README.md
CHANGED
@@ -1,8 +1,8 @@
Before:
---
title: Conspectum
emoji:
colorFrom:
colorTo:
sdk: streamlit
sdk_version: 1.44.1
app_file: app.py

After:
---
title: Conspectum
emoji: 📚
colorFrom: yellow
colorTo: yellow
sdk: streamlit
sdk_version: 1.44.1
app_file: app.py
app.py
ADDED
@@ -0,0 +1,33 @@
import streamlit as st
### ICON BANK: 🗣️🎙️🎤🗨 📚📝🎞️👩🏫👨🏫💡📖 🗒️🔑💾


# Page config
st.set_page_config(
    page_title='Conspectum: Video Lectures Summarization',
    # page_icon='conspectum_logo.png',
    page_icon='📚',
    layout='wide',
    menu_items={
        'Get help': 'https://edu.olymponline.ru/',
        'About': "# MIPT Master's :: Hackathon - Spring '25. Team 8 - Conspectum"
    },
)

pg = st.navigation({
    'Home':
    [
        # ui_home := st.Page('ui_home.py', title='Welcome', icon='🏠'),
        ui_upload := st.Page('ui_upload.py', title='Upload', icon='📥'),
        ui_create_summary := st.Page('ui_create_summary.py', title='Create Summary', icon='✨')
    ],
    'Pipeline Sandbox':
    [
        ui_transcribe := st.Page('ui_transcribe.py', title='Transcribe', icon='🎙️'),
        ui_video := st.Page('ui_video.py', title='Analyse Video', icon='🖼️'),
        ui_summarize := st.Page('ui_summarize.py', title='Summarize', icon='📝'),
        ui_result := st.Page('ui_result.py', title='Result', icon='✔️')
    ]
})

pg.run()
requirements.txt
ADDED
@@ -0,0 +1,30 @@
streamlit
streamlit_autorefresh
streamlit_extras
ffmpeg-python
ffmpegcv
moviepy

torch
torchvision
torchaudio
transformers

yt-dlp

openai-whisper
faster-whisper
SpeechRecognition
PyAudio
pydub
librosa

python-docx
pandas
matplotlib

pyperclip

scenedetect
easyocr
pytesseract
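Several of these packages are thin wrappers around system binaries (ffmpeg-python needs the ffmpeg CLI, pytesseract needs the tesseract CLI), which requirements.txt alone does not install. A small optional sanity check, not part of the repo, for verifying they are on PATH:

import shutil

for binary in ('ffmpeg', 'tesseract'):
    # These wrappers only fail at runtime if the underlying CLI tool is missing,
    # so checking up front saves debugging time on a fresh Space.
    print(binary, 'found at', shutil.which(binary) or 'NOT FOUND')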
transcriber.py
ADDED
@@ -0,0 +1,68 @@
import whisper
from tempfile import NamedTemporaryFile


class Transcription:
    def __init__(self, source):
        self.source = source
        # self.device = device
        self.audios = []

        # with NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
        #     tmp_file.write(file.getvalue())
        #     # self.audios.append(tmp_file.name)
        #     self.audios.append(tmp_file)

        self.audios.append(source)

    def transcribe(
        self,
        model
        # whisper_model_option: str,
        # translation: bool,
    ):
        # # Get the whisper model
        # transcriber = whisper.load_model(whisper_model_option, device=self.device)

        self.output = []

        for idx, _ in enumerate(self.audios):
            # identify language
            audio = whisper.load_audio(self.audios[idx])
            audio = whisper.pad_or_trim(audio)

            # print(model.__dict__)
            # n_mels = 128 if 'large' in model.name else 80
            mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

            _, probs = model.detect_language(mel)
            language = max(probs, key=probs.get)

            self.raw_output = model.transcribe(
                self.audios[idx],
                language=language,
                verbose=True,
                word_timestamps=True,
                # fp16=(model.device == 'cuda')  # use fp16 on GPU for speed/memory
            )
            # if (translation):
            #     self.translation = model.transcribe(
            #         self.audios[idx],
            #         language=language,
            #         verbose=True,
            #         word_timestamps=True,
            #         task='translate'
            #     )["text"]
            #     self.raw_output["translation"] = self.translation

            self.segments = self.raw_output['segments']
            for segment in self.raw_output['segments']:
                del segment['tokens']

            self.raw_output.update(
                name=self.source[idx],  # .name,
                language=language
            )

            self.output.append(self.raw_output)
        print(self.raw_output['segments'])
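A minimal usage sketch of the Transcription class above (this mirrors how ui_transcribe.py drives it later in this commit; the model size and the 'lecture.wav' path are illustrative assumptions, not files in the repo):

import whisper
from transcriber import Transcription

model = whisper.load_model('base')    # any openai-whisper checkpoint
t = Transcription('lecture.wav')      # hypothetical audio file; passed straight to whisper.load_audio
t.transcribe(model)                   # fills t.output with the raw whisper result dicts
print(t.output[0]['language'], t.output[0]['text'][:200])

Note that transcribe() expects an already-loaded whisper model object (it reads model.dims.n_mels and model.device), so model loading stays with the caller.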
ui_create_summary.py
ADDED
File without changes
ui_home.py
ADDED
@@ -0,0 +1,8 @@
import streamlit as st


st.title('📚 Conspectum: Video Lectures Summarization 📝')

st.markdown('''
    Welcome to the Video Lecture Summarizer app!
''')
ui_result.py
ADDED
File without changes
ui_summarize.py
ADDED
@@ -0,0 +1,338 @@
import streamlit as st
import os
from transformers import pipeline
import time
from docx import Document
from io import BytesIO
os.environ['STREAMLIT_SERVER_ENABLE_FILE_WATCHER'] = 'false'
import torch
from langchain_ollama.llms import OllamaLLM
# from utils import cleanup_session_files, get_session_id  # for cleanup button


st.title("📝 Step 4: Lecture Notes Summarization & Structuring")

# Check if transcript and potentially OCR text are available
transcript_available = 'transcript' in st.session_state and st.session_state['transcript']
frames_available = 'frames_dir' in st.session_state and st.session_state['frames_dir']

if not transcript_available and not frames_available:
    st.warning("No text content (Transcript or OCR) found. Please complete previous steps first.")
    st.stop()

# st.info("This step combines the generated transcript and OCR text (if available) and creates a summary.")

# --- Combine Sources ---
st.subheader('Sources')
# combined_text = ""
source_info = []

if transcript_available:
    st.success('✅ Transcript found')
    # st.success(len(st.session_state.transcript.__dict__['output']))
    # st.success(st.session_state.transcript.__dict__['output'][0]['text'])
    # combined_text += '--- Transcript ---\n' + st.session_state.transcript['output'][0]['text'] + '\n\n'
    # st.success(st.session_state.transcript.output[0]['text'])

    transcript_text = st.session_state.transcript.output[0]['text']

    # combined_text += '--- Transcript ---\n\n' + transcript_text + '\n\n'

    # st.write(combined_text)

    source_info.append('Transcript')
    with st.expander('Show Transcript'):
        st.text_area('Transcript', transcript_text, height=200, key='sum_transcript_disp')
else:
    st.warning("Transcript not available.")

if frames_available:
    st.success("✅ Extracted frames found")
    # combined_text += "--- OCR results ---\n" + st.session_state['frames_dir']
    source_info.append('Frames dir')
    # with st.expander('Extracted frames directory'):
    #     st.text_area('Extracted frames directory', st.session_state['frames_dir'], height=200, key="sum_ocr_disp")
    #     st.text_area('Extracted frames directory', st.session_state['frames_dir'], height=200, key="sum_ocr_disp")
    st.text_input('Extracted frames directory', st.session_state['frames_dir'])
else:
    st.warning("OCR Text not available.")

# combined_text = combined_text.strip()

# if not combined_text:
#     st.error("Combined text is empty. Cannot proceed.")
if not transcript_text:
    st.error('Transcript text is empty. Cannot proceed.')
    st.stop()


# --- Summarization Configuration ---

st.subheader('Summarization Settings')
# Consider different models/pipelines
summarizer_options = ['gemma3',
                      # 'gemma3:27b',
                      'phi4',
                      'mistral-small3.1',

                      # 'YandexGPT',
                      # 't5-base',
                      # 't5-large',
                      # 'facebook/mbart-large-50',

                      # 'facebook/bart-large-cnn',
                      # 'google/pegasus-xsum',
                      ]

# Note: Models like Pegasus/XSUM produce very short, abstractive summaries. BART/CNN is better for longer summaries. T5 is versatile.
selected_model = st.selectbox('Select Summarization Model:', summarizer_options, index=0)


# # Dynamic length based on input size (example logic)
# # input_length = len(combined_text.split())
# input_length = len(transcript_text.split())  # approx word count
# default_min = max(50, input_length // 10)  # suggest min length ~10% of input
# default_max = max(150, input_length // 3)  # suggest max length ~30% of input

# min_length = st.slider("Minimum Summary Length (tokens):", min_value=30, max_value=max(500, default_max + 100), value=default_min)
# max_length = st.slider("Maximum Summary Length (tokens):", min_value=50, max_value=max(1000, default_max + 200), value=default_max)

# if min_length >= max_length:
#     st.warning("Minimum length should be less than maximum length.")
#     # Adjust max_length automatically or prevent proceeding
#     max_length = min_length + 50  # simple adjustment


# --- Generate Summary ---

def describe_video(model, frames_dir, describe_prompt):
    images = []

    for file in os.listdir(frames_dir):
        images.append(os.path.join(frames_dir, file))

    model_with_images = model.bind(images=images)

    return model_with_images.invoke(describe_prompt)



with st.expander('**Prompt**', expanded=True):
    # col_1, col_2 = st.columns(2)

    describe_prompt = st.text_area(label='Промпт', height=300, value='''
Ты - ассистент, который создает конспекты лекций на основе предоставленного текста. Этот текст состоит из двух частей: 1. транскрибация аудио-дорожки видеолекции, 2. Изображение выделенных из видео ключевых кадров, с полезной информацией.

Сделай детальный конспект по тому, что описывается в видео. Для иллюстрации сравнений и сопоставлений используй markdown-таблицы. Ответ предоставь в формате markdown.
Придерживайся следующей структуры:

## Содержание:
1. [Название темы 1](###Название_темы_1) (таймкод начала)
2. [Название темы 2](###Название_темы_2) (таймкод начала)
...

## Краткий конспект:

### Название_темы_1
[Текст из транскрипции, относящийся к этой теме]

[Формулы, относящиеся к этой теме]

[Таблицы, относящиеся к этой теме]

---

### Название_темы_2
[Текст из транскрипции, относящийся к этой теме]

[Формулы, относящиеся к этой теме]

[Таблицы, относящиеся к этой теме]

---
…

Здесь необходимо обратить внимание на следующие детали:
1. правильно подобрать названия тем
2. написать сжатый текст, оставляя (без сильного переформулирования) важную информацию.
3. на основе предоставленного транскрибированного аудио и текста со слайдов попытайся составить таблицы в стиле markdown. Для этого проанализируй упомянутые ключевые термины и попытайся понять как их можно сравнить.
4. Если ты понимаешь, что на некотором слайде должна быть ВАЖНАЯ формула (непосредственно относящаяся к теме занятия), которую плохо транскрибировали (или пропустили, хотя лектор её проговаривал/упоминал), то можешь привести её самостоятельно, если знаешь о ней. При этом подпиши под ней, что формулу написал ты.

Вот упомянутый транскрибированный текст:


''')


_, col_button_summary, _ = st.columns([2, 1, 2])
if col_button_summary.button('Generate Summary', type='primary', use_container_width=True):
    st.session_state['summary'] = None  # clear previous summary

    with st.spinner(f'Performing summarization with `{selected_model}` model..'):
        st.session_state.summary = describe_video(model=OllamaLLM(model=selected_model),
                                                  frames_dir=st.session_state.frames_dir,
                                                  describe_prompt=describe_prompt + transcript_text)

    # if combined_text:
    #     with st.spinner(f"Summarizing text using {selected_model}.. Может занять некоторое время (до x2)"):
    #         try:
    #             start_time = time.time()

    #             # Load the pipeline - specify device if possible
    #             device = 0 if torch.cuda.is_available() else -1  # device=0 for first GPU, -1 for CPU
    #             summarizer = pipeline("summarization", model=selected_model, device=device)

    #             # Handle potential long input (simplistic chunking if needed, better models handle longer inputs)
    #             # Basic check: Transformers often have input limits (e.g., 1024 tokens for BART).
    #             # A more robust solution involves chunking, summarizing chunks, and combining summaries.
    #             # For this example, we'll try summarizing directly, but add a warning.
    #             max_model_input_length = getattr(summarizer.model.config, 'max_position_embeddings', 1024)  # get model's max length
    #             if len(summarizer.tokenizer.encode(combined_text)) > max_model_input_length:
    #                 st.warning(f'Input text might be too long for {selected_model} (max ~{max_model_input_length} tokens).' +
    #                            f'Consider using models designed for longer text or implementing chunking.')
    #                 # Simple Truncation (Not Ideal):
    #                 # truncated_text = summarizer.tokenizer.decode(summarizer.tokenizer.encode(combined_text, max_length=max_model_input_length, truncation=True))
    #                 # summary_result = summarizer(truncated_text, max_length=max_length, min_length=min_length, do_sample=False)

    #             # Attempt summarization (may error if too long and not handled)
    #             summary_result = summarizer(combined_text, max_length=max_length, min_length=min_length, do_sample=False)
    #             st.session_state['summary'] = summary_result[0]['summary_text']

    #             end_time = time.time()
    #             st.success(f"Summary generated in {end_time - start_time:.2f} seconds.")

    #         except Exception as e:
    #             st.error(f"Error during summarization: {e}")
    #             st.error("This could be due to model loading issues, insufficient memory, or input text length.")
    #             if 'summarizer' in locals():
    #                 del summarizer  # try to free memory
    #             if device == 0: torch.cuda.empty_cache()

    # else:
    #     st.error("No text available to summarize.")


# --- Display and Refine Summary ---
# st.subheader('Summary')

if 'summary' in st.session_state and st.session_state['summary']:
    with st.container(height=600, border=True):
        summary_container = st.empty()
        edited_summary = st.session_state['summary']

        # summary_container.markdown(st.session_state['summary'])
        summary_container.markdown(edited_summary, unsafe_allow_html=True)

    _, col_button_render, _ = st.columns([2, 1, 2])

    # Use st.text_area for editing
    edited_summary = st.text_area(
        'Edit the summary here (Markdown format supported):',
        value=st.session_state['summary'],
        height=400,
        key='summary_edit_area'
    )

    if col_button_render.button('Render Markdown', type='secondary', use_container_width=True):
        with st.spinner('Generating Markdown preview..'):
            # st.markdown(edited_summary, unsafe_allow_html=True)
            summary_container.markdown(edited_summary, unsafe_allow_html=True)
            # st.session_state['summary'] = edited_summary  # update summary
    # else:
    #     st.markdown('', unsafe_allow_html=True)


    # --- Export Options ---
    st.subheader('📥 Export Notes (Download)')
    col_export_md, col_export_docx, col_export_pdf = st.columns(3)

    st.session_state['final_notes'] = edited_summary  # store edited version
    final_notes_md = st.session_state.get('final_notes', '')

    # 1. Markdown (.md) export
    col_export_md.download_button(
        label="📥 Markdown (.md)",
        data=final_notes_md,
        file_name="lecture_notes.md",
        mime="text/markdown",
        use_container_width=True,
    )

    # 2. Word (.docx) export
    try:
        doc = Document()
        doc.add_heading('Lecture Notes Summary', 0)
        # Add basic Markdown conversion (very simple - assumes paragraphs)
        # For full Markdown -> Docx, a library like 'pandoc' (external) or more complex parsing is needed.
        paragraphs = final_notes_md.split('\n\n')  # split by double newline
        for para in paragraphs:
            if para.strip():  # avoid empty paragraphs
                # Basic handling for potential markdown emphasis (crude)
                # A proper Markdown parser would be better here
                cleaned_para = para.replace('*', '').replace('_', '').replace('#', '').strip()
                doc.add_paragraph(cleaned_para)

        # Save docx to a BytesIO buffer
        buffer = BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        col_export_docx.download_button(
            label='📥 Word (.docx)',
            data=buffer,
            file_name='lecture_notes.docx',
            mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            use_container_width=True
        )
    except Exception as docx_e:
        st.error(f'Failed to generate .docx file: {docx_e}')

    # 3. PDF (.pdf) export
    try:
        col_export_pdf.download_button(
            label='📥 PDF (.pdf)',
            data=buffer,
            file_name="lecture_notes.pdf",
            use_container_width=True,
            # mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            disabled=True
        )
    except Exception as pdf_e:
        st.error(f'Failed to generate .pdf file: {pdf_e}')



    # 3. PDF Export (Requires extra libraries/setup - Placeholder)
    # st.markdown("---")
    # st.write("**PDF Export:**")
    # try:
    #     from mdpdf.cli import mdpdf
    #     pdf_buffer = BytesIO()
    #     # This often requires command-line execution or careful API usage
    #     # Simplified placeholder - actual implementation may vary:
    #     # mdpdf(pdf_buffer, md=final_notes_md, ...)  # Fictional direct API call
    #     st.info("PDF generation via libraries like mdpdf/WeasyPrint requires setup.")

    # except ImportError:
    #     st.warning("`mdpdf` library not installed. PDF export unavailable.")
    # except Exception as pdf_e:
    #     st.error(f"Failed to generate PDF (requires setup): {pdf_e}")


else:
    st.info('Summary has not been generated or is empty.')


# --- Optional: Cleanup Button ---
# st.sidebar.markdown("---")
# if st.sidebar.button("End Session & Clean Up Files"):
#     session_id = get_session_id()
#     cleanup_session_files(session_id)
#     # Clear relevant session state keys
#     keys_to_clear = ['video_path', 'audio_path', 'frames_dir', 'transcript', 'summary', 'final_notes', 'extracted_frames', 'session_id']
#     for key in keys_to_clear:
#         if key in st.session_state:
#             del st.session_state[key]
#     st.success("Temporary files cleaned and session data cleared.")
#     st.info("You can now start a new session from the 'Main' page.")
#     # Consider navigating back to Main page or just showing message
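The commented-out transformers path in this file notes that long transcripts exceed typical summarization-model input limits and that a more robust solution would chunk the text, summarize each chunk, and combine the partial summaries. A minimal sketch of that idea, assuming a summarize(text) callable (hypothetical, e.g. a wrapper around whichever model is selected above):

def summarize_long(text, summarize, chunk_words=800, overlap=100):
    # Split into overlapping word chunks, summarize each, then summarize
    # the concatenation of the partial summaries once more.
    words = text.split()
    if not words:
        return ''
    step = chunk_words - overlap
    chunks = [' '.join(words[i:i + chunk_words]) for i in range(0, len(words), step)]
    partial = [summarize(chunk) for chunk in chunks]
    return partial[0] if len(partial) == 1 else summarize('\n'.join(partial))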
ui_transcribe.py
ADDED
@@ -0,0 +1,284 @@
import streamlit as st
from streamlit_extras.stylable_container import stylable_container

import os
import time
import pathlib
from datetime import timedelta

os.environ['STREAMLIT_SERVER_ENABLE_FILE_WATCHER'] = 'false'
import whisper  # openai-whisper
import torch  # check for GPU availability

# from models.loader import load_model_sst

from transcriber import Transcription
import matplotlib.colors as mcolors


st.title('🎙️ Step 2: Speech-to-Text (ASR/STT)')

# Check if audio path exists from previous step
if 'audio_path' not in st.session_state or not st.session_state['audio_path'] or not os.path.exists(st.session_state['audio_path']):
    st.warning('Audio file not found. Please go back to the "**📤 Upload**" page and process a video first.')
    st.stop()

audio_path = st.session_state['audio_path']


# st.write(f'Audio file to process: `{os.path.basename(audio_path)}`')
st.write(f'Processing audio `{st.session_state.video_input_title}` from video input')

if 'start_time' not in st.session_state:
    st.session_state.start_time = 0

# st.audio(audio_path)
# format='audio/wav',
st.audio(audio_path, start_time=st.session_state.start_time)

#
# ==================================================================
#

col_model, col_config = st.columns(2)

# --- Model ---
# with col_model.expander('**MODEL**', expanded=True):
with col_model.container(border=True):
    model_option = st.selectbox(
        'SST Model:',
        ['whisper', 'faster-whisper', 'distill-whisper', 'giga'],
        index=0
    )


# sst_model = load_model_sst(model_option)


# --- Configuration ---
with col_config.expander('**CONFIG**', expanded=True):
    # Determine device
    default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = st.radio(
        'Compute device:',
        ('cuda', 'cpu'),
        index=0 if default_device == 'cuda' else 1,
        horizontal=True,
        disabled=not torch.cuda.is_available()
    )

    if device == 'cuda' and not torch.cuda.is_available():
        st.warning('CUDA selected but not available, falling back to CPU')
        device = 'cpu'

    whisper_model_option = st.selectbox(
        'Whisper model type:',
        ['tiny', 'base', 'small', 'medium', 'large', 'turbo'],
        index=5
    )

    pauses = st.checkbox('pauses', value=False)

# from models.models_sst import Whisper
# Whisper.config()


##
## --- Transcription ---
##

_, col_button_trancribe, _ = st.columns([2, 1, 2])
if col_button_trancribe.button('Transcribe', type='primary', use_container_width=True):
    # if input_files:
    #     pass
    # else:
    #     st.error("Please select a file")
    st.session_state.transcript = None  # clear previous transcript
    col_info, col_complete, col_next = st.columns(3)

    try:
        with st.spinner(f'Loading Whisper `{whisper_model_option}` model and transcribing..'):
            #-- Load whisper model
            start = time.time()
            # Let Whisper handle device placement if possible
            model = whisper.load_model(whisper_model_option, device=device)
            # load_time =
            col_info.info(f'Model loaded in {time.time() - start:.2f} seconds.')

            #-- Perform transcription
            start = time.time()
            # print('################################')
            # print(st.session_state.audio_path)
            # print('################################')

            st.session_state.transcript = Transcription(st.session_state.audio_path)
            # st.session_state.transcript = Transcription([audio_path])
            # st.session_state.transcript.transcribe(whisper_model_option)
            # st.markdown(model.name)
            st.session_state.transcript.transcribe(model)
            # result = model.transcribe(audio_path, fp16=(device == 'cuda'))  # use fp16 on GPU for speed/memory
            transcribe_time = time.time() - start

            # st.session_state['transcript'] = result['text']
            # st.session_state['transcript'] = st.session_state.transcript
            # Store segments for timestamping/structuring later

            # print(len(st.session_state.transcript['segments']))
            # st.session_state['transcript_segments'] = st.session_state.transcript['segments']

            col_complete.success(f'Transcription complete! (Took {transcribe_time:.2f}s)')

            col_next.page_link('ui_video.py', label='Next Step: **🖼️ Analyze Video**', icon='➡️')

    except Exception as e:
        st.error(f'An error occurred during transcription: {e}')
        # Consider unloading model if error occurs to free memory
        if 'model' in locals():
            del model
        if device == 'cuda':
            torch.cuda.empty_cache()


# --- Video Player ---
with st.expander('**Video Player**', expanded=True):
    col_video, col_segments = st.columns(2)
    col_video.video(st.session_state.video_path, start_time=st.session_state.start_time)


# --- Display Transcript ---
if 'transcript' in st.session_state and st.session_state['transcript']:
    st.markdown('#### Transcription')

    output = st.session_state.transcript.output[0]
    # doc = docx.Document()
    avg_confidence_score = 0
    amount_words = 0
    save_dir = str(pathlib.Path(__file__).parent.absolute()) + '/transcripts/'

    for idx, segment in enumerate(output['segments']):
        for w in output['segments'][idx]['words']:
            amount_words += 1
            avg_confidence_score += w['probability']

    st.badge(
        f'whisper model: **`{whisper_model_option}`** | ' +
        f'language: **`{output["language"]}`** | ' +
        f'confidence score: **`{round(avg_confidence_score / amount_words, 3)}`**'
    )
    prev_word_end = -1
    text = ""
    html_text = ""

    # Define the color map
    colors = [(0.6, 0, 0), (1, 0.7, 0), (0, 0.6, 0)]
    cmap = mcolors.LinearSegmentedColormap.from_list('my_colormap', colors)


    with st.expander('**TRANSCRIPT**', expanded=True):
        color_coding = st.checkbox(
            'color coding',
            value=True,
            # key={i},
            help='Цветное кодирование слов в зависимости от вероятности правильного распознавания: от зелёного (хорошо) до красного (плохо)'
        )

        # https://docs.streamlit.io/develop/api-reference/layout/st.container
        with st.container(height=300, border=False):
            for idx, segment in enumerate(output['segments']):
                for w in output['segments'][idx]['words']:
                    # check for pauses in speech longer than 3s
                    if pauses and prev_word_end != -1 and w['start'] - prev_word_end >= 3:
                        pause = w['start'] - prev_word_end
                        pause_int = int(pause)
                        html_text += f'{"." * pause_int}{{{pause_int}sec}}'
                        text += f'{"." * pause_int}{{{pause_int}sec}}'
                    prev_word_end = w['end']
                    if (color_coding):
                        rgba_color = cmap(w['probability'])
                        rgb_color = tuple(round(x * 255)
                                          for x in rgba_color[:3])
                    else:
                        rgb_color = (0, 0, 0)
                    html_text += f"<span style='color:rgb{rgb_color}'>{w['word']}</span>"
                    text += w['word']
                    # insert line break if there is a punctuation mark
                    if any(c in w['word'] for c in '!?.') and not any(c.isdigit() for c in w['word']):
                        html_text += '<br><br>'
                        text += '\n\n'
            st.markdown(html_text, unsafe_allow_html=True)
            # doc.add_paragraph(text)

    # if (translation):
    #     with st.expander("English translation"):
    #         st.markdown(output["translation"], unsafe_allow_html=True)

    # # save transcript as docx. in local folder
    # file_name = output['name'] + "-" + whisper_model + \
    #     "-" + datetime.today().strftime('%d-%m-%y') + ".docx"
    # doc.save(save_dir + file_name)

    # bio = io.BytesIO()
    # doc.save(bio)
    # st.download_button(
    #     label="Download Transcription",
    #     data=bio.getvalue(),
    #     file_name=file_name,
    #     mime="docx"
    # )


    # --- Display Segments with timestamps ---
    # if 'segments' in st.session_state.transcript:
    #     with st.expander('Detailed segments (with timestamps)'):
    #         st.json(st.session_state.transcript['segments'])

    format_time = lambda s: str(timedelta(seconds=int(s)))

    # st.write(st.session_state.transcript.output[0]['segments'])


    # https://discuss.streamlit.io/t/replaying-an-audio-file-with-a-timecode-click/48892/9
    # with col_segments.expander('**SEGMENTS**', expanded=True):
    # with col_segments.container('**SEGMENTS**', expanded=True):
    # https://docs.streamlit.io/develop/api-reference/layout/st.container
    with col_segments.container(height=400, border=False):
        # Style buttons as links
        with stylable_container(
            key='link_buttons',
            css_styles='''
                button {
                    background: none!important;
                    border: none;
                    padding: 0!important;
                    font-family: arial, sans-serif;
                    color: #069;
                    cursor: pointer;
                }
            ''',
        ):
            for i, segment in enumerate(st.session_state.transcript.output[0]['segments']):
                start = format_time(segment['start'])
                end = format_time(segment['end'])
                text = segment['text'].strip()

                # 🕒Segment {i + 1}
                # st.badge(f'**[{start} - {end}]** {text}', color='gray')
                # st.markdown(
                #     f':violet-badge[**{start} - {end}**] :gray-badge[{text}]'
                # )

                col_timecode, col_text = st.columns([1, 5])
                # seg_text = f':violet-badge[**{start} - {end}**] :gray-badge[{text}]'
                if col_timecode.button(f':violet-badge[**{start} – {end}**]', use_container_width=True):
                    st.session_state['start_time'] = start
                    st.rerun()

                # col_text.markdown(f':gray-badge[`{text}`]')
                # col_text.write('#')
                # col_text.markdown(f'<div style="text-align: bottom;">:gray-badge[{text}]</div>', unsafe_allow_html=True)
                col_text.text(f'{text}')
                # col_text.badge(text, color='gray')


# else:
#     st.info('Transcript has not been generated yet.')
ui_upload.py
ADDED
@@ -0,0 +1,320 @@
import streamlit as st
import os
import time

from yt_dlp import YoutubeDL
import ffmpeg
import tempfile

from utils import (save_uploaded_file, extract_audio,
                   download_youtube, get_session_dir,
                   cleanup_session_files, get_session_id,
                   get_temp_dir, get_features, proc_raw_audio)


st.title('📥📄 Step 1: Upload Video & Preprocess')


# Initialize session state defaults
defaults = {
    'uploaded_file': None,
    'video_path': None,
    'audio_path': None,
    'ocr_text': None,
    'transcript': None,
    'summary': None,

    'main_topic': None,

    'input_method': 'Upload',
    'input_title': None,

    'video_input_path': None,
    'video_url': None,

    'audio_wav': None,
    'audio_file': None,
}

for key, value in defaults.items():
    st.session_state.setdefault(key, value)


# --- Option to clear previous session ---
st.sidebar.write('Current Session ID:')
st.sidebar.write(f'`{get_session_id()}`')  # session ID for debugging

if st.sidebar.button('Start New Session'):
    session_id = get_session_id()  # get current ID before clearing
    cleanup_session_files(session_id)
    for key in list(st.session_state.keys()):
        del st.session_state[key]  # clear all session state
    st.rerun()  # rerun the script to reflect cleared state


# --- Main Topic ---
st.session_state.main_topic = st.text_input('Enter video topic:', st.session_state.main_topic)
# st.session_state.main_topic = m
# col_url, col_start_from = st.columns([5, 2])
# video_url = col_url.text_input('Enter YouTube video URL:', example_youtube['url'])
# start_from = col_start_from.number_input(
#     'Start From:',
#     min_value=0.0, step=0.5, format='%f', value=example_youtube['start'],
#     help='Time shift from the beginning (in seconds)'
# )

# if video_url:
#     st.session_state.video_url = video_url
#     st.session_state.video_input_path = ''  # clear path if URL is used



# --- Video source selection ---
input_method = st.radio(
    'Select Input Method:',
    ('Upload', 'YouTube'),
    key='input_method',
    horizontal=True
)

video_path = None
uploaded_file = None
video_url = None


if input_method == 'Upload':
    uploaded_file = st.file_uploader(
        'Choose a video file',
        type=['mp4', 'avi', 'mkv', 'mov']
    )

    if uploaded_file:
        col_info, col_ready = st.columns(2)

        # Display basic file info
        col_info.info('**[ File Details ]** ' +
                      f'name: `{uploaded_file.name}` | ' +
                      f'type: `{uploaded_file.type}` | ' +
                      f'size: `{uploaded_file.size / (1024 * 1024):.2f} MB`')

        # Save uploaded file temporarily for the Prefect flow
        temp_dir = get_temp_dir()  # use a shared temp location
        # Use a unique name to avoid conflicts if multiple users run simultaneously
        target_path = os.path.join(temp_dir, f'upload_{get_session_id()}_{uploaded_file.name}')
        try:
            with open(target_path, 'wb') as f:
                f.write(uploaded_file.getbuffer())
            st.session_state.video_input_path = target_path
            st.session_state.video_input_title = uploaded_file.name
            st.session_state.video_url = ''  # clear URL if file is uploaded
            st.session_state.transcript = None
            st.session_state.summary = None
            col_ready.info('Ready for processing.')

        except Exception as e:
            col_ready.error(f'Error saving uploaded file: {e}')
            st.session_state.video_input_path = ''


elif input_method == 'YouTube':
    #-- Obtain audio from YouTube video
    example_youtube = {
        'title': 'Общественное движение',
        'url': 'https://www.youtube.com/watch?v=c3bhkrKF6F4',
        'start': 0.0
    }

    col_url, col_start_from = st.columns([5, 2])
    video_url = col_url.text_input('Enter YouTube video URL:', example_youtube['url'])
    start_from = col_start_from.number_input(
        'Start From:',
        min_value=0.0, step=0.5, format='%f', value=example_youtube['start'],
        help='Time shift from the beginning (in seconds)'
    )

    if video_url:
        st.session_state.video_url = video_url
        st.session_state.video_input_path = ''  # clear path if URL is used


@st.cache_resource
def ui_processed_sound(audio_wav, audio_np):
    '''UI to show sound processing results'''
    st.audio(audio_wav)
    features = get_features(audio_np)


@st.cache_resource
def extract_videofile(video_file):
    # video_buffer = BytesIO(video_file.read())
    # audio_data = VideoFileClip(video_buffer.name).audio

    # raw_source = StringIO(video_file.getvalue().decode('utf-8'))
    # raw_source = video_file.getvalue().decode('utf-8')
    # raw_source = video_file.read()
    # raw_source = BytesIO(video_file.getvalue())

    #-- Get video
    # out, err = (
    #     ffmpeg
    #     .input(video_file, ss=start_from)
    #     .output('temp.mp4', vcodec='copy')
    #     .overwrite_output()
    #     .run()
    # )
    # st.video('temp.mp4')

    # video = VideoFileClip(video_file)
    # audio = video.audio
    # audio.write_audiofile('output_audio.mp3')

    tfile = tempfile.NamedTemporaryFile(delete=False)
    tfile.write(video_file.read())

    #-- Get audio
    # SAMPLE_RATE = 16000
    audio_data, err = (
        ffmpeg
        .input(tfile.name, ss=start_from)
        .output('pipe:', format='wav')  # , acodec='pcm_s16le')
        # .output('pipe:', format='s16le', ac=1, acodec='pcm_s16le', ar=SAMPLE_RATE)
        # .global_args('-nostdin', '-threads', '0')
        .run(capture_stdout=True)
    )
    if err:
        raise RuntimeError(f'Failed to load audio: {err.decode()}')

    return audio_data


@st.cache_resource
def extract_youtube(raw_url):
    #-- Get video
    # out, err = (
    #     ffmpeg
    #     .input(raw_url, ss=start_from)
    #     .output('temp.mp4', vcodec='copy')
    #     .overwrite_output()
    #     .run()
    # )
    # st.video('temp.mp4')

    #-- Get audio
    # SAMPLE_RATE = 16000
    audio_data, err = (
        ffmpeg
        .input(raw_url, ss=start_from)
        .output('pipe:', format='wav')  # , acodec='pcm_s16le')
        # .output('pipe:', format='s16le', ac=1, acodec='pcm_s16le', ar=SAMPLE_RATE)
        .global_args('-nostdin', '-threads', '0')
        .run(capture_stdout=True)
    )
    if err:
        raise RuntimeError(f'Failed to load audio: {err.decode()}')

    return audio_data




# --- Processing Button ---
if st.button('Process video input',
             type='primary',
             disabled=not (st.session_state.video_input_path or st.session_state.video_url)
             ):
    # Clear previous paths if reprocessing
    st.session_state['video_path'] = None
    st.session_state['audio_path'] = None

    col_info, col_complete, col_next = st.columns(3)

    with st.spinner('Processing video input..'):
        if st.session_state['input_method'] == 'Upload' and uploaded_file:
            st.session_state.uploaded_file = uploaded_file
            video = uploaded_file
            # audio_data = extract_videofile(uploaded_file)

            saved_path = save_uploaded_file(uploaded_file)
            if saved_path:
                st.session_state['video_path'] = saved_path
                col_info.success(f'Video saved temporarily to: {os.path.basename(saved_path)}')
            else:
                col_info.error('Failed to save uploaded file')

        elif st.session_state['input_method'] == 'YouTube' and video_url:
            try:
                with YoutubeDL({'format': 'best+bestaudio'}) as ydl:
                    info = ydl.extract_info(video_url, download=False)
            except Exception as e:
                st.error(e)
            else:
                st.write(f"<small><div style='float: center; text-align: center'>\
                    **Title:** [{info['title']}]({video_url})\
                    **Duration:** {info['duration']} sec.</div></small>",
                    unsafe_allow_html=True)

                video = video_url
                # audio_data = extract_youtube(info['url'])
                st.session_state.video_input_title = info['title']

                session_dir = get_session_dir()
                os.makedirs(session_dir, exist_ok=True)
                downloaded_path = download_youtube(video_url, session_dir)
                if downloaded_path and os.path.exists(downloaded_path):
                    st.session_state['video_path'] = downloaded_path
                    col_info.success(f'YouTube video downloaded: {os.path.basename(downloaded_path)}')
                else:
                    col_info.error('Failed to download YouTube video')

        else:
            st.warning('Please upload a file or provide a YouTube URL')
            st.stop()


        # --- Basic Preprocessing: Audio Extraction ---
        if st.session_state['video_path']:
            # st.write('Extracting audio..')
            start = time.time()
            # Ensure utils.extract_audio uses the correct path
            audio_path = extract_audio(st.session_state['video_path'])
            end = time.time()
            if audio_path and os.path.exists(audio_path):
                st.session_state['audio_path'] = audio_path
                col_info.success(f'Audio extracted to: {os.path.basename(audio_path)} (took {end - start:.2f}s)')
            else:
                col_info.error('Failed to extract audio from the video')
                st.warning('Proceeding without audio. STT step will be skipped')
                st.session_state['audio_path'] = None  # explicitly set to None

        if st.session_state['video_path']:
            col_complete.info('Preprocessing complete')
            col_next.page_link('ui_transcribe.py', label='Next Step: 🎙️ **Transcribe**', icon='➡️')


            # Display video
            st.subheader('Video Player')
            _, col_video, _ = st.columns([1, 3, 1])
            col_video.video(video)

            # audio_data = audio_path
            # audio_wav, audio_np = proc_raw_audio(audio_data)

            # st.session_state.audio_wav = audio_wav
            # st.session_state.audio_np = audio_np

            # # st.session_state.video = video.read()

            # ui_processed_sound(audio_wav, audio_np)


# # Display current status
# st.subheader("Current Status:")
# if st.session_state.get('video_path'):
#     st.success(f"✅ Video Loaded: {os.path.basename(st.session_state['video_path'])}")
# else:
#     st.warning("⏳ Video not yet loaded or processed.")

# if st.session_state.get('audio_path'):
#     st.success(f"✅ Audio Extracted: {os.path.basename(st.session_state['audio_path'])}")
# elif st.session_state.get('video_path'):  # only show warning if video was loaded but audio failed
#     st.warning("⚠️ Audio extraction failed or video has no audio track.")
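utils.py is added in this same commit but its body is not captured in this view, so extract_audio, save_uploaded_file and the other helpers imported above are opaque here. Purely as an illustration of what an ffmpeg-python based extract_audio helper of this shape typically looks like (an assumption, not the actual utils.py code):

import os
import ffmpeg

def extract_audio_sketch(video_path, out_dir='.'):
    # Hypothetical stand-in for utils.extract_audio: write a 16 kHz mono WAV
    # next to the video and return its path.
    base = os.path.splitext(os.path.basename(video_path))[0]
    audio_path = os.path.join(out_dir, base + '.wav')
    (
        ffmpeg
        .input(video_path)
        .output(audio_path, format='wav', ac=1, ar=16000)
        .overwrite_output()
        .run(quiet=True)
    )
    return audio_path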
ui_video.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import os
|
3 |
+
import pytesseract
|
4 |
+
from PIL import Image
|
5 |
+
import time
|
6 |
+
from utils import extract_frames_interval, extract_frames_pyscenedetect
|
7 |
+
|
8 |
+
|
9 |
+
st.title('🖼️ Step 3: Video Processing (Frame Extraction & OCR)')
|
10 |
+
|
11 |
+
|
12 |
+
# Check if video path exists
|
13 |
+
if ('video_path' not in st.session_state or
|
14 |
+
not st.session_state['video_path'] or
|
15 |
+
not os.path.exists(st.session_state['video_path'])
|
16 |
+
):
|
17 |
+
st.warning('Video file not found. Please go back to the **📤 Upload** page and process a video first.')
|
18 |
+
st.stop()
|
19 |
+
|
20 |
+
video_path = st.session_state['video_path']
|
21 |
+
st.write(f'Video file to process: `{os.path.basename(video_path)}`')
|
22 |
+
|
23 |
+
#
|
24 |
+
# ==================================================================
|
25 |
+
#
|
26 |
+
|
27 |
+
col_method, col_config = st.columns(2)
|
28 |
+
|
29 |
+
# --- Method ---
|
30 |
+
# with col_model.expander('**MODEL**', expanded=True):
|
31 |
+
with col_method.container(border=True):
|
32 |
+
# extraction_method = st.selectbox(
|
33 |
+
# 'Extraction method:',
|
34 |
+
# ('interval', 'video2slides', 'pyscenedetect'),
|
35 |
+
# index=0
|
36 |
+
# )
|
37 |
+
extraction_method = st.radio(
|
38 |
+
'Extraction method:',
|
39 |
+
('interval', 'video2slides', 'pyscenedetect'),
|
40 |
+
index=0,
|
41 |
+
horizontal=True,
|
42 |
+
)
|
43 |
+
|
44 |
+
# col_config_frame_interval, col_config_ocr_lang = st.columns(2)
|
45 |
+
# frame_interval = col_config_frame_interval.slider('Extract frames every `X` seconds:', min_value=1, max_value=60, value=5, step=1)
|
46 |
+
# ocr_lang = col_config_ocr_lang.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
|
47 |
+
ocr_lang = st.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
|
48 |
+
|
49 |
+
# --- Configuration ---
|
50 |
+
with col_config.expander(f'**`{extraction_method}` METHOD CONFIG**', expanded=True):
|
51 |
+
match extraction_method:
|
52 |
+
case 'interval':
|
53 |
+
extraction_interval = st.number_input(
|
54 |
+
'Frames extraction interval:',
|
55 |
+
min_value=0, max_value=25, step=1, format='%i', value=5,
|
56 |
+
help='Extract frames every `x` seconds'
|
57 |
+
)
|
58 |
+
case 'video2slides':
|
59 |
+
print('video2slides')
|
60 |
+
case 'pyscenedetect':
|
61 |
+
extraction_threshold = st.number_input(
|
62 |
+
'Frames extraction threshold:',
|
63 |
+
min_value=0.1, max_value=10.0, step=0.1, format='%f', value=2.0,
|
64 |
+
)
|
65 |
+
|
66 |
+
|
67 |
+
# --- Semantic Segmentation Placeholder ---
|
68 |
+
# st.markdown("---")
|
69 |
+
# --- Tesseract Configuration (Optional but recommended) ---
|
70 |
+
# Uncomment and set the path if tesseract is not in your PATH
|
71 |
+
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/your/tesseract' # Example: '/usr/bin/tesseract' or 'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
|
76 |
+
# # --- Frame Extraction and OCR ---
|
77 |
+
# st.subheader('OCR')
|
78 |
+
|
79 |
+
if st.button('Extract Frames'):
|
80 |
+
# st.session_state['ocr_text'] = None # clear previous results
|
81 |
+
st.session_state['frames_paths'] = []
|
82 |
+
# all_ocr_results = []
|
83 |
+
|
col_info, col_complete, col_next = st.columns(3)

match extraction_method:
    case 'interval':
        with st.spinner(f'Extracting frames every {extraction_interval} seconds (using interval method)..'):
            start_time = time.time()
            frames_dir, frame_paths = extract_frames_interval(video_path, 'frames_interval', interval_sec=extraction_interval)
            extract_time = time.time() - start_time
            if frames_dir and frame_paths:
                st.session_state['frames_dir'] = frames_dir
                st.session_state['frames_paths'] = frame_paths  # store paths
                col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
            else:
                col_info.error('Failed to extract frames')
                st.stop()
    case 'video2slides':
        pass  # not implemented yet
    case 'pyscenedetect':
        with st.spinner(f'Extracting frames with `threshold={extraction_threshold}` (using pyscenedetect method)..'):
            start_time = time.time()
            frames_dir, frame_paths = extract_frames_pyscenedetect(video_path, 'frames_pyscenedetect', threshold=extraction_threshold)
            extract_time = time.time() - start_time
            if frames_dir and frame_paths:
                st.session_state['frames_dir'] = frames_dir
                st.session_state['frames_paths'] = frame_paths  # store paths
                col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
            else:
                col_info.error('Failed to extract frames')
                st.stop()


if st.session_state['frames_paths']:
    total_frames = len(st.session_state['frames_paths'])
    col_info.write(f'Performing OCR on {total_frames} frames..')
    ocr_progress = st.progress(0)
    start_ocr_time = time.time()

    extracted_texts = []
    processed_count = 0

    # Use columns to display some example frames and OCR
    max_display_frames = 6
    display_cols = st.columns(min(max_display_frames, total_frames) if total_frames > 0 else 1)
    display_idx = 0

    # Process frames one by one
    for i, frame_path in enumerate(st.session_state['frames_paths']):
        try:
            img = Image.open(frame_path)
            # --- Potential Preprocessing/Filtering ---
            # Add logic here if needed:
            # - Detect if frame likely contains text (e.g., check contrast, edges)
            # - If segmentation was implemented, crop to slide regions here
            # --- Perform OCR ---
            text = pytesseract.image_to_string(img, lang=ocr_lang)
            # --- Basic Text Cleaning/Filtering ---
            cleaned_text = text.strip()
            if cleaned_text and len(cleaned_text) > 10:  # filter very short/noisy results
                # Extract timestamp from filename (assuming format frame_XXXXXX.png)
                try:
                    secs = int(os.path.basename(frame_path).split('_')[1].split('.')[0])
                    timestamp = time.strftime('%H:%M:%S', time.gmtime(secs))
                except (IndexError, ValueError):
                    timestamp = 'N/A'  # fallback if filename parse fails
                extracted_texts.append({'timestamp': timestamp, 'text': cleaned_text})

                # Display some examples
                if display_idx < max_display_frames and display_idx < len(display_cols):
                    with display_cols[display_idx]:
                        st.image(img, caption=f'Frame (t={timestamp})', use_container_width=True)
                        st.text(f'OCR:\n{cleaned_text[:100]}..')  # show snippet
                    display_idx += 1

            processed_count += 1
            ocr_progress.progress(processed_count / total_frames)

        except Exception as ocr_err:
            col_info.warning(f'Could not perform OCR on {os.path.basename(frame_path)}: {ocr_err}')
            processed_count += 1  # still count as processed
            ocr_progress.progress(processed_count / total_frames)

    ocr_time = time.time() - start_ocr_time
    col_complete.success(f'OCR processing finished in {ocr_time:.2f}s.')

    # --- Aggregate and Deduplicate OCR Text ---
    # Simple approach: combine text blocks, skipping near-exact consecutive duplicates.
    # A more robust check would use sequence matching or a fuzzy-matching library such as `thefuzz`.
    final_ocr_text = ""
    last_text = ""

    for item in extracted_texts:
        current_text_block = item['text'].strip()

        # Basic check: only add if significantly different from the last block
        is_duplicate = False
        if last_text:
            # Simple check: exact match or near-exact length/content start?
            if (current_text_block == last_text or
                (abs(len(current_text_block) - len(last_text)) < 10 and
                 current_text_block.startswith(last_text[:20]))
            ):
                is_duplicate = True  # likely a duplicate from consecutive frames

        if current_text_block and not is_duplicate:  # only add non-empty, non-duplicate text
            final_ocr_text += f"\n\n--- Text from frame around {item['timestamp']} ---\n"
            final_ocr_text += current_text_block
            last_text = current_text_block  # update last text added

    st.session_state['ocr_text'] = final_ocr_text.strip()

    if st.session_state['ocr_text']:
        col_complete.info('OCR processing complete.')
        col_next.page_link('ui_summarize.py', label='Next Step: **📝 Summarize**', icon='➡️')
    else:
        col_complete.warning('No significant text found via OCR')


# --- Display OCR Results ---
st.subheader('Aggregated OCR Text')
if 'ocr_text' in st.session_state and st.session_state['ocr_text']:
    st.text_area('Extracted Text from Frames', st.session_state['ocr_text'], height=400)
else:
    st.info('OCR has not been run or no text was detected')


# st.divider()

# st.subheader('Semantic Segmentation')
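The duplicate check above only catches near-exact consecutive repeats; as its comments note, sequence matching is the more robust option. A minimal sketch using the standard library's difflib (no extra dependency; the 0.8 cutoff is an assumed value, not something tuned in this commit):

from difflib import SequenceMatcher

def is_near_duplicate(current: str, previous: str, cutoff: float = 0.8) -> bool:
    '''Returns True when two OCR blocks are likely the same slide captured twice.'''
    if not previous:
        return False
    return SequenceMatcher(None, current, previous).ratio() >= cutoff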
utils.py
ADDED
@@ -0,0 +1,436 @@
import os
import tempfile
# import ffmpeg
from moviepy.video.io.VideoFileClip import VideoFileClip
import cv2
import uuid
import tomllib
from pathlib import Path

import streamlit as st

import numpy as np
from io import BytesIO

from pydub import AudioSegment
from pydub.silence import detect_leading_silence
import librosa

import librosa.display as lbd
import matplotlib.pyplot as plt


TEMP_DIR = tempfile.mkdtemp()


CONFIG_FILE = 'config.toml'


def load_config():
    """Loads configuration from config.toml"""
    try:
        with open(CONFIG_FILE, 'rb') as f:
            return tomllib.load(f)
    except FileNotFoundError:
        print(f"Error: {CONFIG_FILE} not found. Using default settings.")
        # Provide default fallback config if needed
        return {
            "paths": {"output_dir": "output", "temp_dir": "temp_processing"},
            "models": {"whisper_model": "base.en", "ocr_languages": ["en"], "summarization_model": "google/pegasus-xsum"},
            "settings": {"frame_extraction_interval_seconds": 10, "max_summary_length": 500, "min_summary_length": 100}
        }
    except Exception as e:
        print(f"Error loading config: {e}")
        raise  # re-raise after printing


CONFIG = load_config()
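# A sketch of a per-key accessor with defaults (hypothetical helper, not part of this module):
# it guards call sites against a user-supplied config.toml that omits keys present in the
# fallback dict above.
#
#   def cfg(section, key, default=None):
#       return CONFIG.get(section, {}).get(key, default)
#
#   # e.g. cfg('settings', 'frame_extraction_interval_seconds', 10)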


def ensure_dir(directory_path):
    """Creates a directory if it doesn't exist."""
    Path(directory_path).mkdir(parents=True, exist_ok=True)


def save_uploaded_file(uploaded_file):
    """Saves an uploaded file to a temporary directory."""
    if uploaded_file is not None:
        # Generate a unique sub-directory for this upload
        session_id = get_session_id()  # simple way to group files per session/upload
        upload_dir = os.path.join(TEMP_DIR, session_id)
        os.makedirs(upload_dir, exist_ok=True)

        file_path = os.path.join(upload_dir, uploaded_file.name)
        with open(file_path, 'wb') as f:
            f.write(uploaded_file.getbuffer())
        print(f'File saved to: {file_path}')  # debugging
        return file_path
    return None
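# Usage sketch (illustrative only; the widget label and accepted types are assumptions,
# not taken from the upload page):
#
#   uploaded = st.file_uploader('Upload a lecture video', type=['mp4', 'mkv', 'avi'])
#   if uploaded is not None:
#       st.session_state['video_path'] = save_uploaded_file(uploaded)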


def get_session_id():
    """Generates or retrieves a unique session ID."""
    if 'session_id' not in st.session_state:
        st.session_state['session_id'] = str(uuid.uuid4())[:8]
    return st.session_state['session_id']


def get_session_dir():
    """Gets the temporary directory path for the current session."""
    session_id = get_session_id()
    return os.path.join(TEMP_DIR, session_id)


def get_temp_dir():
    """Creates and returns the path to a temporary directory for processing."""
    temp_dir = Path(CONFIG['paths']['temp_dir'])
    ensure_dir(temp_dir)
    # Consider using unique subdirs per run if needed
    # processing_subdir = tempfile.mkdtemp(dir=temp_dir)
    # return processing_subdir
    return str(temp_dir)  # return as string for wider compatibility


def extract_audio(video_path, audio_format="wav"):
    """Extracts audio from video using moviepy."""
    try:
        session_dir = os.path.dirname(video_path)  # assumes video is in the session dir
        base_name = os.path.splitext(os.path.basename(video_path))[0]
        audio_filename = f"{base_name}_audio.{audio_format}"
        audio_path = os.path.join(session_dir, audio_filename)

        if os.path.exists(audio_path):
            print(f"Audio file already exists: {audio_path}")
            return audio_path

        print(f"Extracting audio from {video_path} to {audio_path}...")
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        if audio_clip is None:
            print("No audio track found in the video.")
            video_clip.close()
            return None
        audio_clip.write_audiofile(audio_path, codec='pcm_s16le' if audio_format == 'wav' else 'mp3')  # WAV is often better for STT
        audio_clip.close()
        video_clip.close()
        print("Audio extraction complete.")
        return audio_path
    except Exception as e:
        print(f"Error extracting audio: {e}")
        # Clean up potentially corrupted file
        if 'audio_clip' in locals() and audio_clip:
            audio_clip.close()
        if 'video_clip' in locals() and video_clip:
            video_clip.close()
        # Attempt to remove partial file if creation failed mid-way
        if 'audio_path' in locals() and os.path.exists(audio_path):
            try:
                os.remove(audio_path)
            except OSError as rm_e:
                print(f"Could not remove partial audio file {audio_path}: {rm_e}")
        return None
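# Usage sketch (assumes openai-whisper is installed; the model name and path are illustrative):
#
#   import whisper
#   audio_path = extract_audio('path/to/lecture.mp4')
#   if audio_path:
#       model = whisper.load_model('base')
#       transcript = model.transcribe(audio_path)['text']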


from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector


def extract_frames_pyscenedetect(video_path, output_dir, threshold=2.0):
    '''Detects scene changes with PySceneDetect and saves the first frame of each scene.'''
    # session_dir = os.path.dirname(video_path)
    # frames_dir = os.path.join(session_dir, 'frames_pyscenedetect')
    # os.makedirs(frames_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)  # ensure the output dir exists

    # Init video- and scene- managers
    # video_manager = VideoManager([video_path])
    video = open_video(video_path)
    scene_manager = SceneManager()

    scene_manager.add_detector(ContentDetector(threshold=threshold))

    # Start analysis
    # video_manager.set_downscale_factor()
    # video_manager.start()
    # scene_manager.detect_scenes(frame_source=video_manager)
    scene_manager.detect_scenes(video)

    # Get the scene list
    scene_list = scene_manager.get_scene_list()
    print(scene_list)  # raw debug output
    print(f'Detected {len(scene_list)} scene changes.')

    # Save the frame at each scene switch
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f'Error: Could not open video file {video_path}')
        return None, []

    extracted_frame_paths = []

    for i, (start_time, _) in enumerate(scene_list):
        frame_num = start_time.get_frames()
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        success, frame = cap.read()
        if success:
            timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
            # frame_filename = f'scene_{i + 1:03d}.jpg'
            frame_filename = f'frame_{int(timestamp_ms / 1000):06d}.png'  # naming by seconds
            frame_path = os.path.join(output_dir, frame_filename)
            cv2.imwrite(frame_path, frame)
            print(f'[*] Saved frame {frame_num} to {frame_path}')
            extracted_frame_paths.append(frame_path)
        else:
            print(f'[!] Error reading frame {frame_num}')

    cap.release()
    print(f'Extracted {len(extracted_frame_paths)} frames to {output_dir}.')
    return output_dir, extracted_frame_paths
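# Note: ContentDetector's threshold is a 0-255 content-change score and PySceneDetect's own
# default is 27.0, so the 2.0 default above is very sensitive and may fire on nearly every
# small change. Usage sketch (paths and values are illustrative):
#
#   frames_dir, frame_paths = extract_frames_pyscenedetect('lecture.mp4', 'frames_pyscenedetect', threshold=27.0)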


def extract_frames_interval(video_path, output_dir, interval_sec=5):
    '''Extracts frames from video at specified intervals using OpenCV.'''
    try:
        # session_dir = os.path.dirname(video_path)
        # frames_dir = os.path.join(session_dir, 'frames_interval')
        # os.makedirs(frames_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)  # ensure the output dir exists

        print(f'Extracting frames from {video_path} every {interval_sec}s..')
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f'Error: Could not open video file {video_path}')
            return None, []

        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0:
            print('Warning: Could not get FPS, defaulting to 30.')
            fps = 30  # provide a default if FPS is not available

        frame_interval = int(fps * interval_sec)
        frame_count = 0
        extracted_frame_paths = []

        def extract_frame(frame):
            timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
            frame_filename = f'frame_{int(timestamp_ms / 1000):06d}.png'  # naming by seconds
            frame_path = os.path.join(output_dir, frame_filename)
            cv2.imwrite(frame_path, frame)
            extracted_frame_paths.append(frame_path)

        success = True
        while success:
            if frame_count % frame_interval == 0:
                # Decode and save a frame at every interval boundary
                success, frame = cap.read()
                if success:
                    extract_frame(frame)
            else:
                # Skip in-between frames efficiently without decoding them
                success = cap.grab()
            frame_count += 1

        cap.release()
        print(f'Extracted {len(extracted_frame_paths)} frames to {output_dir}.')
        return output_dir, extracted_frame_paths
    except Exception as e:
        print(f'Error extracting frames: {e}')
        if 'cap' in locals() and cap.isOpened():
            cap.release()
        return None, []


# --- Add other potential helpers: yt-dlp download, file cleanup etc. ---
def download_youtube(url, output_dir):
    """Downloads YouTube video using yt-dlp."""
    import yt_dlp
    ydl_opts = {
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': os.path.join(output_dir, '%(title)s.%(ext)s'),
        'noplaylist': True,  # download only a single video if the URL is part of a playlist
        'progress_hooks': [lambda d: print(d['status'])]  # basic progress
    }
    try:
        print(f'Attempting to download YouTube video: {url}')
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            # Try to get the downloaded filename
            filename = ydl.prepare_filename(info)
            print(f"YouTube video downloaded to: {filename}")
            return filename
    except Exception as e:
        print(f"Error downloading YouTube video: {e}")
        return None


def cleanup_session_files(session_id):
    """Removes the temporary directory for a given session."""
    session_dir = os.path.join(TEMP_DIR, session_id)
    if os.path.exists(session_dir):
        import shutil
        try:
            shutil.rmtree(session_dir)
            print(f"Cleaned up temporary files for session: {session_id}")
        except Exception as e:
            print(f"Error cleaning up session files {session_dir}: {e}")


###
###=== Audio Loading and Processing
###


SAMPLE_RATE = 22050
DURATION = 5

n_mfcc = 13  # number of MFCCs to extract from each sample
n_mels = 128

n_fft = 2048
hop_length = 512

delta_width = 9  # MFCC Delta parameter


def trim_silence(sound, s_thresh=-28.0):
    '''Trims silent chunks from the beginning and end of the sound'''
    duration = len(sound)

    start_trim = detect_leading_silence(sound, s_thresh)
    end_trim = detect_leading_silence(sound.reverse(), s_thresh)

    start = start_trim if start_trim != duration else None
    end = duration - end_trim if end_trim != duration else None

    return sound[start:end]


def normalize_volume(sound, target_dBFS=-20.0):
    '''Normalizes sound and shifts it to the specified loudness'''
    sound = sound.normalize()
    difference = target_dBFS - sound.dBFS
    return sound.apply_gain(difference)


def proc_raw_audio(audio_data, from_start=0, duration=None, before_end=0):
    '''Processes raw audio data and returns wav bytes and a numpy array'''
    # Instantiate a pydub AudioSegment object from the raw audio
    audioObj = AudioSegment.from_file(BytesIO(audio_data))

    # Convert to mono with the desired sample rate
    audioObj = audioObj.set_frame_rate(SAMPLE_RATE).set_channels(1)
    # Normalize audio volume
    audioObj = normalize_volume(audioObj)
    # Trim silence from the beginning and end of the sound
    audioObj = trim_silence(audioObj)

    # Cut to the desired duration
    start = from_start * 1000
    if duration:
        end = start + duration * 1000
    else:
        end = len(audioObj) - before_end * 1000
    audioObj = audioObj[start:end]

    # Convert the AudioSegment to wav-format bytes
    buf = BytesIO()
    audioObj.export(buf, format='wav')
    audio_wav = buf.getvalue()

    # Convert the AudioSegment to a signal in the form of a numpy array
    arr = audioObj.get_array_of_samples()
    audio_np = np.array(arr, dtype='float')

    # Normalize if specified
    # if normalized:
    #     audio_np = np.array(arr) / np.iinfo(arr.typecode).max
    #     y /= np.linalg.norm(y)
    #     return y, sample_rate

    return audio_wav, audio_np
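# Usage sketch (file name is illustrative): feed raw bytes in, get wav bytes for playback
# plus a float array for feature extraction:
#
#   with open('sample_lecture.wav', 'rb') as f:
#       audio_wav, audio_np = proc_raw_audio(f.read(), from_start=30, duration=5)
#   st.audio(audio_wav, format='audio/wav')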


###==============================================


def obtain_features(y, sr=22050, duration=5, delta_width=9):
    '''Extracts sound features from the given signal and returns them as a numpy array'''
    # --- MFCC (returns M: np.ndarray [shape=(n_mfcc, t)])
    mfcc = librosa.feature.mfcc(y=y, sr=sr,
                                n_mfcc=n_mfcc, n_mels=n_mels,
                                n_fft=n_fft, hop_length=hop_length)

    return mfcc


def create_features_array(mfcc):  # , mfcc_delta1, mfcc_delta2, spectr_c, spectr_r):
    '''Creates a holistic numpy array of means and variances out of the given features'''
    make_meanvar = lambda mean, var: [item for mv in zip(mean, var) for item in mv]

    mean_var_ops = [
        (mfcc.mean(axis=1), mfcc.var(axis=1))
    ]

    mfcc_meanvars = sum([make_meanvar(mean, var)
                         for mean, var in mean_var_ops], [])

    # features_array = mfcc_meanvars + spectr_meanvars
    features_array = [mfcc_meanvars]

    return features_array

# def get_features(y, sr=22050, duration=5, delta_width=9):
#     '''Returns numpy array of sound features obtained from signal'''
#     return create_features_array(*obtain_features(y, sr, duration, delta_width))


def get_features(y, duration=5, sr=SAMPLE_RATE):
    '''Plots the waveform and mel spectrogram of the signal and returns the padded mel spectrogram'''

    fig, axes = plt.subplots(1, 2, figsize=(24, 2))

    # WAVE PLOT
    axes[0].set_title(f'Wave Plot for audio sample at {sr} Hz')
    axes[0].set_facecolor('#B4E8CF')
    lbd.waveshow(y, sr=sr, color='#4300FF', ax=axes[0])

    # MELSPEC
    melspec = librosa.feature.melspectrogram(y=y, sr=sr)
    melspec = librosa.power_to_db(np.abs(melspec), ref=np.max)
    axes[1].set_title(f'Mel Spectrogram | shape: {melspec.shape}')
    lbd.specshow(melspec, cmap='viridis', y_axis='mel', x_axis='time', ax=axes[1])

    st.pyplot(fig)

    # Pad the time axis to a fixed 216 frames so downstream models get a constant shape
    pad_signal = lambda s, v: np.pad(
        s,
        [(0, 0), (0, max(0, 216 - s.shape[1]))],
        constant_values=v
    )

    # Prepare melspec for use
    melspec = pad_signal(melspec, melspec.min())
    melspec = melspec.reshape(1, *melspec.shape)

    # MFCC
    # mfcc = create_features_array(obtain_features(y, sr, duration, delta_width))
    # mfcc = np.array(mfcc).reshape(1, -1)

    return melspec
    # return mfcc
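A minimal end-to-end sketch of the audio-feature helpers above, assuming a short mono clip loaded with librosa inside a Streamlit page (the file name is illustrative; get_features draws its plots via st.pyplot):

import librosa

# Load a short mono clip at the module's sample rate
y, sr = librosa.load('sample_clip.wav', sr=SAMPLE_RATE, mono=True, duration=DURATION)

# Mel-spectrogram "image" ready for a downstream model: shape (1, 128, 216) after padding
melspec_batch = get_features(y, duration=DURATION, sr=sr)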