macsunmood committed on
Commit
e4e56ea
·
1 Parent(s): d214c78
Files changed (13)
  1. .streamlit/config.toml +11 -0
  2. README.md +3 -3
  3. app.py +33 -0
  4. requirements.txt +30 -0
  5. transcriber.py +68 -0
  6. ui_create_summary.py +0 -0
  7. ui_home.py +8 -0
  8. ui_result.py +0 -0
  9. ui_summarize.py +338 -0
  10. ui_transcribe.py +284 -0
  11. ui_upload.py +320 -0
  12. ui_video.py +216 -0
  13. utils.py +436 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,11 @@
+ [client]
+ showErrorDetails = true
+
+ [server]
+ headless = false
+ enableCORS = false
+ enableXsrfProtection = false
+ maxUploadSize = 5000
+
+ [theme]
+ primaryColor = "#FA8E00"
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Conspectum
- emoji: 👀
- colorFrom: pink
- colorTo: pink
+ emoji: 📚
+ colorFrom: yellow
+ colorTo: yellow
  sdk: streamlit
  sdk_version: 1.44.1
  app_file: app.py
app.py ADDED
@@ -0,0 +1,33 @@
+ import streamlit as st
+ ### ICON BANK: 🗣️🎙️🎤🗨 📚📝🎞️👩‍🏫👨‍🏫💡📖 🗒️🔑💾
+
+
+ # Page config
+ st.set_page_config(
+     page_title='Conspectum: Video Lectures Summarization',
+     # page_icon='conspectum_logo.png',
+     page_icon='📚',
+     layout='wide',
+     menu_items={
+         'Get help': 'https://edu.olymponline.ru/',
+         'About': "# MIPT Master's :: Hackathon - Spring '25. Team 8 - Conspectum"
+     },
+ )
+
+ pg = st.navigation({
+     'Home':
+     [
+         # ui_home := st.Page('ui_home.py', title='Welcome', icon='🏠'),
+         ui_upload := st.Page('ui_upload.py', title='Upload', icon='📥'),
+         ui_create_summary := st.Page('ui_create_summary.py', title='Create Summary', icon='✨')
+     ],
+     'Pipeline Sandbox':
+     [
+         ui_transcribe := st.Page('ui_transcribe.py', title='Transcribe', icon='🎙️'),
+         ui_video := st.Page('ui_video.py', title='Analyse Video', icon='🖼️'),
+         ui_summarize := st.Page('ui_summarize.py', title='Summarize', icon='📝'),
+         ui_result := st.Page('ui_result.py', title='Result', icon='✔️')
+     ]
+ })
+
+ pg.run()
requirements.txt ADDED
@@ -0,0 +1,32 @@
+ streamlit
+ streamlit_autorefresh
+ streamlit_extras
+ ffmpeg-python
+ ffmpegcv
+ moviepy
+
+ torch
+ torchvision
+ torchaudio
+ transformers
+ langchain-ollama  # OllamaLLM is imported in ui_summarize.py
+
+ yt-dlp
+
+ openai-whisper
+ faster-whisper
+ SpeechRecognition
+ PyAudio
+ pydub
+ librosa
+
+ python-docx
+ pandas
+ matplotlib
+
+ pyperclip
+
+ opencv-python  # cv2 is imported in utils.py
+ scenedetect
+ easyocr
+ pytesseract
transcriber.py ADDED
@@ -0,0 +1,68 @@
+ import whisper
+ from tempfile import NamedTemporaryFile
+
+
+ class Transcription:
+     def __init__(self, source):
+         self.source = source
+         # self.device = device
+         self.audios = []
+
+         # with NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+         #     tmp_file.write(file.getvalue())
+         #     # self.audios.append(tmp_file.name)
+         #     self.audios.append(tmp_file)
+
+         self.audios.append(source)
+
+     def transcribe(
+         self,
+         model
+         # whisper_model_option: str,
+         # translation: bool,
+     ):
+         # # Get the whisper model
+         # transcriber = whisper.load_model(whisper_model_option, device=self.device)
+
+         self.output = []
+
+         for idx, _ in enumerate(self.audios):
+             # identify language
+             audio = whisper.load_audio(self.audios[idx])
+             audio = whisper.pad_or_trim(audio)
+
+             # print(model.__dict__)
+             # n_mels = 128 if 'large' in model.name else 80
+             mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
+
+             _, probs = model.detect_language(mel)
+             language = max(probs, key=probs.get)
+
+             self.raw_output = model.transcribe(
+                 self.audios[idx],
+                 language=language,
+                 verbose=True,
+                 word_timestamps=True,
+                 # fp16=(model.device == 'cuda')  # use fp16 on GPU for speed/memory
+             )
+             # if (translation):
+             #     self.translation = model.transcribe(
+             #         self.audios[idx],
+             #         language=language,
+             #         verbose=True,
+             #         word_timestamps=True,
+             #         task='translate'
+             #     )["text"]
+             #     self.raw_output["translation"] = self.translation
+
+             self.segments = self.raw_output['segments']
+             for segment in self.raw_output['segments']:
+                 del segment['tokens']
+
+             self.raw_output.update(
+                 name=self.audios[idx],  # path of the audio that was transcribed
+                 language=language
+             )
+
+             self.output.append(self.raw_output)
+             print(self.raw_output['segments'])
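For orientation, a minimal sketch of how `Transcription` is driven from the Streamlit pages (mirroring `ui_transcribe.py`); the model name and audio path below are placeholders, not part of this commit:

```python
import whisper
from transcriber import Transcription

# assumes a WAV file produced by utils.extract_audio; the path is illustrative
model = whisper.load_model('turbo', device='cpu')   # 'cuda' when available
transcription = Transcription('session_dir/lecture_audio.wav')
transcription.transcribe(model)

result = transcription.output[0]
print(result['language'])                  # detected language code
print(result['text'][:200])                # beginning of the transcript
for seg in result['segments'][:3]:         # word-level timestamps are included
    print(seg['start'], seg['end'], seg['text'])
```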
ui_create_summary.py ADDED
File without changes
ui_home.py ADDED
@@ -0,0 +1,8 @@
+ import streamlit as st
+
+
+ st.title('📚 Conspectum: Video Lectures Summarization 📝')
+
+ st.markdown('''
+ Welcome to the Video Lecture Summarizer app!
+ ''')
ui_result.py ADDED
File without changes
ui_summarize.py ADDED
@@ -0,0 +1,338 @@
1
+ import streamlit as st
2
+ import os
3
+ from transformers import pipeline
4
+ import time
5
+ from docx import Document
6
+ from io import BytesIO
7
+ os.environ['STREAMLIT_SERVER_ENABLE_FILE_WATCHER'] = 'false'
8
+ import torch
9
+ from langchain_ollama.llms import OllamaLLM
10
+ # from utils import cleanup_session_files, get_session_id # for cleanup button
11
+
12
+
13
+ st.title("📝 Step 4: Lecture Notes Summarization & Structuring")
14
+
15
+ # Check if transcript and potentially OCR text are available
16
+ transcript_available = 'transcript' in st.session_state and st.session_state['transcript']
17
+ frames_available = 'frames_dir' in st.session_state and st.session_state['frames_dir']
18
+
19
+ if not transcript_available and not frames_available:
20
+ st.warning("No text content (Transcript or OCR) found. Please complete previous steps first.")
21
+ st.stop()
22
+
23
+ # st.info("This step combines the generated transcript and OCR text (if available) and creates a summary.")
24
+
25
+ # --- Combine Sources ---
26
+ st.subheader('Sources')
27
+ # combined_text = ""
28
+ source_info = []
29
+
30
+ if transcript_available:
31
+ st.success('✅ Transcript found')
32
+ # st.success(len(st.session_state.transcript.__dict__['output']))
33
+ # st.success(st.session_state.transcript.__dict__['output'][0]['text'])
34
+ # combined_text += '--- Transcript ---\n' + st.session_state.transcript['output'][0]['text'] + '\n\n'
35
+ # st.success(st.session_state.transcript.output[0]['text'])
36
+
37
+ transcript_text = st.session_state.transcript.output[0]['text']
38
+
39
+ # combined_text += '--- Transcript ---\n\n' + transcript_text + '\n\n'
40
+
41
+ # st.write(combined_text)
42
+
43
+ source_info.append('Transcript')
44
+ with st.expander('Show Transcript'):
45
+ st.text_area('Transcript', transcript_text, height=200, key='sum_transcript_disp')
46
+ else:
+ st.warning('Transcript not available.')
+ transcript_text = ''  # keeps the emptiness check below from raising NameError
48
+
49
+ if frames_available:
50
+ st.success("✅ Extracted frames found")
51
+ # combined_text += "--- OCR results ---\n" + st.session_state['frames_dir']
52
+ source_info.append('Frames dir')
53
+ # with st.expander('Extracted frames directory'):
54
+ # st.text_area('Extracted frames directory', st.session_state['frames_dir'], height=200, key="sum_ocr_disp")
55
+ # st.text_area('Extracted frames directory', st.session_state['frames_dir'], height=200, key="sum_ocr_disp")
56
+ st.text_input('Extracted frames directory', st.session_state['frames_dir'])
57
+ else:
58
+ st.warning('Extracted frames are not available.')
59
+
60
+ # combined_text = combined_text.strip()
61
+
62
+ # if not combined_text:
63
+ # st.error("Combined text is empty. Cannot proceed.")
64
+ if not transcript_text:
65
+ st.error('Transcript text is empty. Cannot proceed.')
66
+ st.stop()
67
+
68
+
69
+ # --- Summarization Configuration ---
70
+
71
+ st.subheader('Summarization Settings')
72
+ # Consider different models/pipelines
73
+ summarizer_options = ['gemma3',
74
+ # 'gemma3:27b',
75
+ 'phi4',
76
+ 'mistral-small3.1',
77
+
78
+ # 'YandexGPT',
79
+ # 't5-base',
80
+ # 't5-large',
81
+ # 'facebook/mbart-large-50',
82
+
83
+ # 'facebook/bart-large-cnn',
84
+ # 'google/pegasus-xsum',
85
+ ]
86
+
87
+ # Note: Models like Pegasus/XSUM produce very short, abstractive summaries. BART/CNN is better for longer summaries. T5 is versatile.
88
+ selected_model = st.selectbox('Select Summarization Model:', summarizer_options, index=0)
89
+
90
+
91
+ # # Dynamic length based on input size (example logic)
92
+ # # input_length = len(combined_text.split())
93
+ # input_length = len(transcript_text.split()) # approx word count
94
+ # default_min = max(50, input_length // 10) # suggest min length ~10% of input
95
+ # default_max = max(150, input_length // 3) # suggest max length ~30% of input
96
+
97
+ # min_length = st.slider("Minimum Summary Length (tokens):", min_value=30, max_value=max(500, default_max + 100), value=default_min)
98
+ # max_length = st.slider("Maximum Summary Length (tokens):", min_value=50, max_value=max(1000, default_max + 200), value=default_max)
99
+
100
+ # if min_length >= max_length:
101
+ # st.warning("Minimum length should be less than maximum length.")
102
+ # # Adjust max_length automatically or prevent proceeding
103
+ # max_length = min_length + 50 # simple adjustment
104
+
105
+
106
+ # --- Generate Summary ---
107
+
108
+ def describe_video(model, frames_dir, describe_prompt):
109
+ images = []
110
+
111
+ for file in os.listdir(frames_dir):
112
+ images.append(os.path.join(frames_dir, file))
113
+
114
+ model_with_images = model.bind(images=images)
115
+
116
+ return model_with_images.invoke(describe_prompt)
117
+
118
+
119
+
120
+ with st.expander('**Prompt**', expanded=True):
121
+ # col_1, col_2 = st.columns(2)
122
+
123
+ describe_prompt = st.text_area(label='Prompt', height=300, value='''
124
+ Ты - ассистент, который создает конспекты лекций на основе предоставленного текста. Этот текст состоит из двух частей: 1. транскрибация аудио-дорожки видеолекции, 2. Изображение выделенных из видео ключевых кадров, с полезной информацией.
125
+
126
+ Сделай детальный конспект по тому, что описывается в видео. Для иллюстрации сравнений и сопоставлений используй markdown-таблицы. Ответ предоставь в формате markdown.
127
+ Придерживайся следующей структуры:
128
+
129
+ ## Содержание:
130
+ 1. [Название темы 1](###Название_темы_1) (таймкод начала)
131
+ 2. [Название темы 2](###Название_темы_2) (таймкод начала)
132
+ ...
133
+
134
+ ## Краткий конспект:
135
+
136
+ ### Название_темы_1
137
+ [Текст из транскрипции, относящийся к этой теме]
138
+
139
+ [Формулы, относящиеся к этой теме]
140
+
141
+ [Таблицы, относящиеся к этой теме]
142
+
143
+ ---
144
+
145
+ ### Название_темы_2
146
+ [Текст из транскрипции, относящийся к этой теме]
147
+
148
+ [Формулы, относящиеся к этой теме]
149
+
150
+ [Таблицы, относящиеся к этой теме]
151
+
152
+ ---
153
+
154
+
155
+ Здесь необходимо обратить внимание на следующие детали:
156
+ 1. правильно подобрать названия тем
157
+ 2. написать сжатый текст, оставляя (без сильного переформулирования) важную информацию.
158
+ 3. на основе предоставленного транскрибированного аудио и текста со слайдов попытайся составить таблицы в стиле markdown. Для этого проанализируй упомянутые ключевые термины и попытайся понять как их можно сравнить.
159
+ 4. Если ты понимаешь, что на некотором слайде должна быть ВАЖНАЯ формула (непосредственно относящаяся к теме занятия), которую плохо транскрибировали (или пропустили, хотя лектор её проговаривал/упоминал), то можешь привести её самостоятельно, если знаешь о ней. При этом подпиши под ней, что формулу написал ты.
160
+
161
+ Вот упомянутый транскрибированный текст:
162
+
163
+
164
+ ''')
165
+
166
+
167
+ _, col_button_summary, _ = st.columns([2, 1, 2])
168
+ if col_button_summary.button('Generate Summary', type='primary', use_container_width=True):
169
+ st.session_state['summary'] = None # clear previous summary
170
+
171
+ with st.spinner(f'Performing summarization with `{selected_model}` model..'):
172
+ st.session_state.summary = describe_video(model=OllamaLLM(model=selected_model),
173
+ frames_dir=st.session_state.frames_dir,
174
+ describe_prompt=describe_prompt + transcript_text)
175
+
176
+ # if combined_text:
177
+ # with st.spinner(f"Summarizing text using {selected_model}.. Может занять некоторое время (до x2)"):
178
+ # try:
179
+ # start_time = time.time()
180
+
181
+ # # Load the pipeline - specify device if possible
182
+ # device = 0 if torch.cuda.is_available() else -1 # device=0 for first GPU, -1 for CPU
183
+ # summarizer = pipeline("summarization", model=selected_model, device=device)
184
+
185
+ # # Handle potential long input (simplistic chunking if needed, better models handle longer inputs)
186
+ # # Basic check: Transformers often have input limits (e.g., 1024 tokens for BART).
187
+ # # A more robust solution involves chunking, summarizing chunks, and combining summaries.
188
+ # # For this example, we'll try summarizing directly, but add a warning.
189
+ # max_model_input_length = getattr(summarizer.model.config, 'max_position_embeddings', 1024) # get model's max length
190
+ # if len(summarizer.tokenizer.encode(combined_text)) > max_model_input_length:
191
+ # st.warning(f'Input text might be too long for {selected_model} (max ~{max_model_input_length} tokens).' +
192
+ # f'Consider using models designed for longer text or implementing chunking.')
193
+ # # Simple Truncation (Not Ideal):
194
+ # # truncated_text = summarizer.tokenizer.decode(summarizer.tokenizer.encode(combined_text, max_length=max_model_input_length, truncation=True))
195
+ # # summary_result = summarizer(truncated_text, max_length=max_length, min_length=min_length, do_sample=False)
196
+
197
+ # # Attempt summarization (may error if too long and not handled)
198
+ # summary_result = summarizer(combined_text, max_length=max_length, min_length=min_length, do_sample=False)
199
+ # st.session_state['summary'] = summary_result[0]['summary_text']
200
+
201
+ # end_time = time.time()
202
+ # st.success(f"Summary generated in {end_time - start_time:.2f} seconds.")
203
+
204
+ # except Exception as e:
205
+ # st.error(f"Error during summarization: {e}")
206
+ # st.error("This could be due to model loading issues, insufficient memory, or input text length.")
207
+ # if 'summarizer' in locals():
208
+ # del summarizer # try to free memory
209
+ # if device == 0: torch.cuda.empty_cache()
210
+
211
+ # else:
212
+ # st.error("No text available to summarize.")
213
+
214
+
215
+ # --- Display and Refine Summary ---
216
+ # st.subheader('Summary')
217
+
218
+ if 'summary' in st.session_state and st.session_state['summary']:
219
+ with st.container(height=600, border=True):
220
+ summary_container = st.empty()
221
+ edited_summary = st.session_state['summary']
222
+
223
+ # summary_container.markdown(st.session_state['summary'])
224
+ summary_container.markdown(edited_summary, unsafe_allow_html=True)
225
+
226
+ _, col_button_render, _ = st.columns([2, 1, 2])
227
+
228
+ # Use st.text_area for editing
229
+ edited_summary = st.text_area(
230
+ 'Edit the summary here (Markdown format supported):',
231
+ value=st.session_state['summary'],
232
+ height=400,
233
+ key='summary_edit_area'
234
+ )
235
+
236
+ if col_button_render.button('Render Markdown', type='secondary', use_container_width=True):
237
+ with st.spinner('Generating Markdown preview..'):
238
+ # st.markdown(edited_summary, unsafe_allow_html=True)
239
+ summary_container.markdown(edited_summary, unsafe_allow_html=True)
240
+ # st.session_state['summary'] = edited_summary # update summary
241
+ # else:
242
+ # st.markdown('', unsafe_allow_html=True)
243
+
244
+
245
+ # --- Export Options ---
246
+ st.subheader('📥 Export Notes (Download)')
247
+ col_export_md, col_export_docx, col_export_pdf = st.columns(3)
248
+
249
+ st.session_state['final_notes'] = edited_summary # store edited version
250
+ final_notes_md = st.session_state.get('final_notes', '')
251
+
252
+ # 1. Markdown (.md) export
253
+ col_export_md.download_button(
254
+ label="📥 Markdown (.md)",
255
+ data=final_notes_md,
256
+ file_name="lecture_notes.md",
257
+ mime="text/markdown",
258
+ use_container_width=True,
259
+ )
260
+
261
+ # 2. Word (.docx) export
262
+ try:
263
+ doc = Document()
264
+ doc.add_heading('Lecture Notes Summary', 0)
265
+ # Add basic Markdown conversion (very simple - assumes paragraphs)
266
+ # For full Markdown -> Docx, a library like 'pandoc' (external) or more complex parsing is needed.
267
+ paragraphs = final_notes_md.split('\n\n') # split by double newline
268
+ for para in paragraphs:
269
+ if para.strip(): # avoid empty paragraphs
270
+ # Basic handling for potential markdown emphasis (crude)
271
+ # A proper Markdown parser would be better here
272
+ cleaned_para = para.replace('*', '').replace('_', '').replace('#', '').strip()
273
+ doc.add_paragraph(cleaned_para)
274
+
275
+ # Save docx to a BytesIO buffer
276
+ buffer = BytesIO()
277
+ doc.save(buffer)
278
+ buffer.seek(0)
279
+
280
+ col_export_docx.download_button(
281
+ label='📥 Word (.docx)',
282
+ data=buffer,
283
+ file_name='lecture_notes.docx',
284
+ mime='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
285
+ use_container_width=True
286
+ )
287
+ except Exception as docx_e:
288
+ st.error(f'Failed to generate .docx file: {docx_e}')
289
+
290
+ # 3. PDF (.pdf) export
291
+ try:
292
+ col_export_pdf.download_button(
293
+ label='📥 PDF (.pdf)',
294
+ data=buffer,
295
+ file_name="lecture_notes.pdf",
296
+ use_container_width=True,
297
+ # mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
298
+ disabled=True
299
+ )
300
+ except Exception as pdf_e:
301
+ st.error(f'Failed to generate .pdf file: {pdf_e}')
302
+
303
+
304
+
305
+ # 3. PDF Export (Requires extra libraries/setup - Placeholder)
306
+ # st.markdown("---")
307
+ # st.write("**PDF Export:**")
308
+ # try:
309
+ # from mdpdf.cli import mdpdf
310
+ # pdf_buffer = BytesIO()
311
+ # # This often requires command-line execution or careful API usage
312
+ # # Simplified placeholder - actual implementation may vary:
313
+ # # mdpdf(pdf_buffer, md=final_notes_md, ...) # Fictional direct API call
314
+ # st.info("PDF generation via libraries like mdpdf/WeasyPrint requires setup.")
315
+
316
+ # except ImportError:
317
+ # st.warning("`mdpdf` library not installed. PDF export unavailable.")
318
+ # except Exception as pdf_e:
319
+ # st.error(f"Failed to generate PDF (requires setup): {pdf_e}")
320
+
321
+
322
+ else:
323
+ st.info('Summary has not been generated or is empty.')
324
+
325
+
326
+ # --- Optional: Cleanup Button ---
327
+ # st.sidebar.markdown("---")
328
+ # if st.sidebar.button("End Session & Clean Up Files"):
329
+ # session_id = get_session_id()
330
+ # cleanup_session_files(session_id)
331
+ # # Clear relevant session state keys
332
+ # keys_to_clear = ['video_path', 'audio_path', 'frames_dir', 'transcript', 'summary', 'final_notes', 'extracted_frames', 'session_id']
333
+ # for key in keys_to_clear:
334
+ # if key in st.session_state:
335
+ # del st.session_state[key]
336
+ # st.success("Temporary files cleaned and session data cleared.")
337
+ # st.info("You can now start a new session from the 'Main' page.")
338
+ # # Consider navigating back to Main page or just showing message
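The commented-out HF `pipeline` path above notes that long transcripts can exceed a model's input limit and that "a more robust solution involves chunking, summarizing chunks, and combining summaries." A minimal sketch of that idea against the Ollama models used here (chunk sizes and the merge prompt are illustrative, not part of this commit):

```python
from langchain_ollama.llms import OllamaLLM

def chunk_text(text: str, max_words: int = 800, overlap: int = 100):
    """Split a long transcript into overlapping word-based chunks (sizes are illustrative)."""
    words = text.split()
    step = max_words - overlap
    for start in range(0, max(len(words), 1), step):
        yield ' '.join(words[start:start + max_words])

def summarize_long_text(llm: OllamaLLM, text: str, prompt_prefix: str) -> str:
    # Summarize each chunk separately, then ask the model to merge the partial notes.
    partial = [llm.invoke(prompt_prefix + chunk) for chunk in chunk_text(text)]
    return llm.invoke('Merge these partial lecture notes into one coherent Markdown summary:\n\n'
                      + '\n\n'.join(partial))

# usage sketch: summarize_long_text(OllamaLLM(model='gemma3'), transcript_text, describe_prompt)
```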
ui_transcribe.py ADDED
@@ -0,0 +1,284 @@
1
+ import streamlit as st
2
+ from streamlit_extras.stylable_container import stylable_container
3
+
4
+ import os
5
+ import time
6
+ import pathlib
7
+ from datetime import timedelta
8
+
9
+ os.environ['STREAMLIT_SERVER_ENABLE_FILE_WATCHER'] = 'false'
10
+ import whisper # openai-whisper
11
+ import torch # check for GPU availability
12
+
13
+ # from models.loader import load_model_sst
14
+
15
+ from transcriber import Transcription
16
+ import matplotlib.colors as mcolors
17
+
18
+
19
+ st.title('🎙️ Step 2: Speech-to-Text (ASR/STT)')
20
+
21
+ # Check if audio path exists from previous step
22
+ if 'audio_path' not in st.session_state or not st.session_state['audio_path'] or not os.path.exists(st.session_state['audio_path']):
23
+ st.warning('Audio file not found. Please go back to the "**📤 Upload**" page and process a video first.')
24
+ st.stop()
25
+
26
+ audio_path = st.session_state['audio_path']
27
+
28
+
29
+ # st.write(f'Audio file to process: `{os.path.basename(audio_path)}`')
30
+ st.write(f'Processing audio `{st.session_state.video_input_title}` from video input')
31
+
32
+ if 'start_time' not in st.session_state:
33
+ st.session_state.start_time = 0
34
+
35
+ # st.audio(audio_path)
36
+ # format='audio/wav',
37
+ st.audio(audio_path, start_time=st.session_state.start_time)
38
+
39
+ #
40
+ # ==================================================================
41
+ #
42
+
43
+ col_model, col_config = st.columns(2)
44
+
45
+ # --- Model ---
46
+ # with col_model.expander('**MODEL**', expanded=True):
47
+ with col_model.container(border=True):
48
+ model_option = st.selectbox(
49
+ 'STT Model:',
50
+ ['whisper', 'faster-whisper', 'distill-whisper', 'giga'],
51
+ index=0
52
+ )
53
+
54
+
55
+ # sst_model = load_model_sst(model_option)
56
+
57
+
58
+ # --- Configuration ---
59
+ with col_config.expander('**CONFIG**', expanded=True):
60
+ # Determine device
61
+ default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
62
+ device = st.radio(
63
+ 'Compute device:',
64
+ ('cuda', 'cpu'),
65
+ index=0 if default_device == 'cuda' else 1,
66
+ horizontal=True,
67
+ disabled=not torch.cuda.is_available()
68
+ )
69
+
70
+ if device == 'cuda' and not torch.cuda.is_available():
71
+ st.warning('CUDA selected but not available, falling back to CPU')
72
+ device = 'cpu'
73
+
74
+ whisper_model_option = st.selectbox(
75
+ 'Whisper model type:',
76
+ ['tiny', 'base', 'small', 'medium', 'large', 'turbo'],
77
+ index=5
78
+ )
79
+
80
+ pauses = st.checkbox('pauses', value=False)
81
+
82
+ # from models.models_sst import Whisper
83
+ # Whisper.config()
84
+
85
+
86
+ ##
87
+ ## --- Transcription ---
88
+ ##
89
+
90
+ _, col_button_transcribe, _ = st.columns([2, 1, 2])
+ if col_button_transcribe.button('Transcribe', type='primary', use_container_width=True):
92
+ # if input_files:
93
+ # pass
94
+ # else:
95
+ # st.error("Please select a file")
96
+ st.session_state.transcript = None # clear previous transcript
97
+ col_info, col_complete, col_next = st.columns(3)
98
+
99
+ try:
100
+ with st.spinner(f'Loading Whisper `{whisper_model_option}` model and transcribing..'):
101
+ #-- Load whisper model
102
+ start = time.time()
103
+ # Let Whisper handle device placement if possible
104
+ model = whisper.load_model(whisper_model_option, device=device)
105
+ # load_time =
106
+ col_info.info(f'Model loaded in {time.time() - start:.2f} seconds.')
107
+
108
+ #-- Perform transcription
109
+ start = time.time()
110
+ # print('################################')
111
+ # print(st.session_state.audio_path)
112
+ # print('################################')
113
+
114
+ st.session_state.transcript = Transcription(st.session_state.audio_path)
115
+ # st.session_state.transcript = Transcription([audio_path])
116
+ # st.session_state.transcript.transcribe(whisper_model_option)
117
+ # st.markdown(model.name)
118
+ st.session_state.transcript.transcribe(model)
119
+ # result = model.transcribe(audio_path, fp16=(device == 'cuda')) # use fp16 on GPU for speed/memory
120
+ transcribe_time = time.time() - start
121
+
122
+ # st.session_state['transcript'] = result['text']
123
+ # st.session_state['transcript'] = st.session_state.transcript
124
+ # Store segments for timestamping/structuring later
125
+
126
+ # print(len(st.session_state.transcript['segments']))
127
+ # st.session_state['transcript_segments'] = st.session_state.transcript['segments']
128
+
129
+ col_complete.success(f'Transcription complete! (Took {transcribe_time:.2f}s)')
130
+
131
+ col_next.page_link('ui_video.py', label='Next Step: **🖼️ Analyze Video**', icon='➡️')
132
+
133
+ except Exception as e:
134
+ st.error(f'An error occurred during transcription: {e}')
135
+ # Consider unloading model if error occurs to free memory
136
+ if 'model' in locals():
137
+ del model
138
+ if device == 'cuda':
139
+ torch.cuda.empty_cache()
140
+
141
+
142
+ # --- Video Player ---
143
+ with st.expander('**Video Player**', expanded=True):
144
+ col_video, col_segments = st.columns(2)
145
+ col_video.video(st.session_state.video_path, start_time=st.session_state.start_time)
146
+
147
+
148
+ # --- Display Transcript ---
149
+ if 'transcript' in st.session_state and st.session_state['transcript']:
150
+ st.markdown('#### Transcription')
151
+
152
+ output = st.session_state.transcript.output[0]
153
+ # doc = docx.Document()
154
+ avg_confidence_score = 0
155
+ amount_words = 0
156
+ save_dir = str(pathlib.Path(__file__).parent.absolute()) + '/transcripts/'
157
+
158
+ for idx, segment in enumerate(output['segments']):
159
+ for w in output['segments'][idx]['words']:
160
+ amount_words += 1
161
+ avg_confidence_score += w['probability']
162
+
163
+ st.badge(
164
+ f'whisper model: **`{whisper_model_option}`** | ' +
165
+ f'language: **`{output["language"]}`** | ' +
166
+ f'confidence score: **`{round(avg_confidence_score / amount_words, 3)}`**'
167
+ )
168
+ prev_word_end = -1
169
+ text = ""
170
+ html_text = ""
171
+
172
+ # Define the color map
173
+ colors = [(0.6, 0, 0), (1, 0.7, 0), (0, 0.6, 0)]
174
+ cmap = mcolors.LinearSegmentedColormap.from_list('my_colormap', colors)
175
+
176
+
177
+ with st.expander('**TRANSCRIPT**', expanded=True):
178
+ color_coding = st.checkbox(
179
+ 'color coding',
180
+ value=True,
181
+ # key={i},
182
+ help='Цветное кодирование слов в зависимости от вероятности правильного распознавания: от зелёного (хорошо) до красного (плохо)'
183
+ )
184
+
185
+ # https://docs.streamlit.io/develop/api-reference/layout/st.container
186
+ with st.container(height=300, border=False):
187
+ for idx, segment in enumerate(output['segments']):
188
+ for w in output['segments'][idx]['words']:
189
+ # check for pauses in speech longer than 3s
190
+ if pauses and prev_word_end != -1 and w['start'] - prev_word_end >= 3:
191
+ pause = w['start'] - prev_word_end
192
+ pause_int = int(pause)
193
+ html_text += f'{"." * pause_int}{{{pause_int}sec}}'
194
+ text += f'{"." * pause_int}{{{pause_int}sec}}'
195
+ prev_word_end = w['end']
196
+ if (color_coding):
197
+ rgba_color = cmap(w['probability'])
198
+ rgb_color = tuple(round(x * 255)
199
+ for x in rgba_color[:3])
200
+ else:
201
+ rgb_color = (0, 0, 0)
202
+ html_text += f"<span style='color:rgb{rgb_color}'>{w['word']}</span>"
203
+ text += w['word']
204
+ # insert line break if there is a punctuation mark
205
+ if any(c in w['word'] for c in '!?.') and not any(c.isdigit() for c in w['word']):
206
+ html_text += '<br><br>'
207
+ text += '\n\n'
208
+ st.markdown(html_text, unsafe_allow_html=True)
209
+ # doc.add_paragraph(text)
210
+
211
+ # if (translation):
212
+ # with st.expander("English translation"):
213
+ # st.markdown(output["translation"], unsafe_allow_html=True)
214
+
215
+ # # save transcript as docx. in local folder
216
+ # file_name = output['name'] + "-" + whisper_model + \
217
+ # "-" + datetime.today().strftime('%d-%m-%y') + ".docx"
218
+ # doc.save(save_dir + file_name)
219
+
220
+ # bio = io.BytesIO()
221
+ # doc.save(bio)
222
+ # st.download_button(
223
+ # label="Download Transcription",
224
+ # data=bio.getvalue(),
225
+ # file_name=file_name,
226
+ # mime="docx"
227
+ # )
228
+
229
+
230
+ # --- Display Segments with timestamps ---
231
+ # if 'segments' in st.session_state.transcript:
232
+ # with st.expander('Detailed segments (with timestamps)'):
233
+ # st.json(st.session_state.transcript['segments'])
234
+
235
+ format_time = lambda s: str(timedelta(seconds=int(s)))
236
+
237
+ # st.write(st.session_state.transcript.output[0]['segments'])
238
+
239
+
240
+ # https://discuss.streamlit.io/t/replaying-an-audio-file-with-a-timecode-click/48892/9
241
+ # with col_segments.expander('**SEGMENTS**', expanded=True):
242
+ # with col_segments.container('**SEGMENTS**', expanded=True):
243
+ # https://docs.streamlit.io/develop/api-reference/layout/st.container
244
+ with col_segments.container(height=400, border=False):
245
+ # Style buttons as links
246
+ with stylable_container(
247
+ key='link_buttons',
248
+ css_styles='''
249
+ button {
250
+ background: none!important;
251
+ border: none;
252
+ padding: 0!important;
253
+ font-family: arial, sans-serif;
254
+ color: #069;
255
+ cursor: pointer;
256
+ }
257
+ ''',
258
+ ):
259
+ for i, segment in enumerate(st.session_state.transcript.output[0]['segments']):
260
+ start = format_time(segment['start'])
261
+ end = format_time(segment['end'])
262
+ text = segment['text'].strip()
263
+
264
+ # 🕒Segment {i + 1}
265
+ # st.badge(f'**[{start} - {end}]** {text}', color='gray')
266
+ # st.markdown(
267
+ # f':violet-badge[**{start} - {end}**] :gray-badge[{text}]'
268
+ # )
269
+
270
+ col_timecode, col_text = st.columns([1, 5])
271
+ # seg_text = f':violet-badge[**{start} - {end}**] :gray-badge[{text}]'
272
+ if col_timecode.button(f':violet-badge[**{start} – {end}**]', key=f'segment_{i}', use_container_width=True):
+ st.session_state['start_time'] = int(segment['start'])  # seconds, so st.audio/st.video can seek
274
+ st.rerun()
275
+
276
+ # col_text.markdown(f':gray-badge[`{text}`]')
277
+ # col_text.write('#')
278
+ # col_text.markdown(f'<div style="text-align: bottom;">:gray-badge[{text}]</div>', unsafe_allow_html=True)
279
+ col_text.text(f'{text}')
280
+ # col_text.badge(text, color='gray')
281
+
282
+
283
+ # else:
284
+ # st.info('Transcript has not been generated yet.')
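The rendering loop above relies on the segment structure Whisper returns when `word_timestamps=True`; roughly (values are illustrative):

```python
segment = {
    'start': 12.3,
    'end': 17.8,
    'text': ' Today we will discuss gradient descent.',
    'words': [
        {'word': ' Today', 'start': 12.3, 'end': 12.6, 'probability': 0.97},
        {'word': ' we',    'start': 12.6, 'end': 12.7, 'probability': 0.99},
        # ... one entry per word; 'probability' drives the color coding above
    ],
}
```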
ui_upload.py ADDED
@@ -0,0 +1,320 @@
1
+ import streamlit as st
2
+ import os
3
+ import time
4
+
5
+ from yt_dlp import YoutubeDL
6
+ import ffmpeg
7
+ import tempfile
8
+
9
+ from utils import (save_uploaded_file, extract_audio,
10
+ download_youtube, get_session_dir,
11
+ cleanup_session_files, get_session_id,
12
+ get_temp_dir, get_features, proc_raw_audio)
13
+
14
+
15
+ st.title('📥📄 Step 1: Upload Video & Preprocess')
16
+
17
+
18
+ # Initialize session state defaults
19
+ defaults = {
20
+ 'uploaded_file': None,
21
+ 'video_path': None,
22
+ 'audio_path': None,
23
+ 'ocr_text': None,
24
+ 'transcript': None,
25
+ 'summary': None,
26
+
27
+ 'main_topic': None,
28
+
29
+ 'input_method': 'Upload',
30
+ 'input_title': None,
31
+
32
+ 'video_input_path': None,
33
+ 'video_url': None,
34
+
35
+ 'audio_wav': None,
36
+ 'audio_file': None,
37
+ }
38
+
39
+ for key, value in defaults.items():
40
+ st.session_state.setdefault(key, value)
41
+
42
+
43
+ # --- Option to clear previous session ---
44
+ st.sidebar.write('Current Session ID:')
45
+ st.sidebar.write(f'`{get_session_id()}`') # session ID for debugging
46
+
47
+ if st.sidebar.button('Start New Session'):
48
+ session_id = get_session_id() # get current ID before clearing
49
+ cleanup_session_files(session_id)
50
+ for key in list(st.session_state.keys()):
51
+ del st.session_state[key] # clear all session state
52
+ st.rerun() # rerun the script to reflect cleared state
53
+
54
+
55
+ # --- Main Topic ---
56
+ st.session_state.main_topic = st.text_input('Enter video topic:', st.session_state.main_topic)
57
+ # st.session_state.main_topic = m
58
+ # col_url, col_start_from = st.columns([5, 2])
59
+ # video_url = col_url.text_input('Enter YouTube video URL:', example_youtube['url'])
60
+ # start_from = col_start_from.number_input(
61
+ # 'Start From:',
62
+ # min_value=0.0, step=0.5, format='%f', value=example_youtube['start'],
63
+ # help='Time shift from the beginning (in seconds)'
64
+ # )
65
+
66
+ # if video_url:
67
+ # st.session_state.video_url = video_url
68
+ # st.session_state.video_input_path = '' # clear path if URL is used
69
+
70
+
71
+
72
+ # --- Video source selection ---
73
+ input_method = st.radio(
74
+ 'Select Input Method:',
75
+ ('Upload', 'YouTube'),
76
+ key='input_method',
77
+ horizontal=True
78
+ )
79
+
80
+ video_path = None
81
+ uploaded_file = None
82
+ video_url = None
83
+
84
+
85
+ if input_method == 'Upload':
86
+ uploaded_file = st.file_uploader(
87
+ 'Choose a video file',
88
+ type=['mp4', 'avi', 'mkv', 'mov']
89
+ )
90
+
91
+ if uploaded_file:
92
+ col_info, col_ready = st.columns(2)
93
+
94
+ # Display basic file info
95
+ col_info.info('**[ File Details ]** ' +
96
+ f'name: `{uploaded_file.name}` | ' +
97
+ f'type: `{uploaded_file.type}` | ' +
98
+ f'size: `{uploaded_file.size / (1024 * 1024):.2f} MB`')
99
+
100
+ # Save uploaded file temporarily for the Prefect flow
101
+ temp_dir = get_temp_dir() # use a shared temp location
102
+ # Use a unique name to avoid conflicts if multiple users run simultaneously
103
+ target_path = os.path.join(temp_dir, f'upload_{get_session_id()}_{uploaded_file.name}')
104
+ try:
105
+ with open(target_path, 'wb') as f:
106
+ f.write(uploaded_file.getbuffer())
107
+ st.session_state.video_input_path = target_path
108
+ st.session_state.video_input_title = uploaded_file.name
109
+ st.session_state.video_url = '' # clear URL if file is uploaded
110
+ st.session_state.transcript = None
111
+ st.session_state.summary = None
112
+ col_ready.info('Ready for processing.')
113
+
114
+ except Exception as e:
115
+ col_ready.error(f'Error saving uploaded file: {e}')
116
+ st.session_state.video_input_path = ''
117
+
118
+
119
+ elif input_method == 'YouTube':
120
+ #-- Obtain audio from YouTube video
121
+ example_youtube = {
122
+ 'title': 'Общественное движение',
123
+ 'url': 'https://www.youtube.com/watch?v=c3bhkrKF6F4',
124
+ 'start': 0.0
125
+ }
126
+
127
+ col_url, col_start_from = st.columns([5, 2])
128
+ video_url = col_url.text_input('Enter YouTube video URL:', example_youtube['url'])
129
+ start_from = col_start_from.number_input(
130
+ 'Start From:',
131
+ min_value=0.0, step=0.5, format='%f', value=example_youtube['start'],
132
+ help='Time shift from the beginning (in seconds)'
133
+ )
134
+
135
+ if video_url:
136
+ st.session_state.video_url = video_url
137
+ st.session_state.video_input_path = '' # clear path if URL is used
138
+
139
+
140
+ @st.cache_resource
141
+ def ui_processed_sound(audio_wav, audio_np):
142
+ '''UI to show sound processing results'''
143
+ st.audio(audio_wav)
144
+ features = get_features(audio_np)
145
+
146
+
147
+ @st.cache_resource
148
+ def extract_videofile(video_file):
149
+ # video_buffer = BytesIO(video_file.read())
150
+ # audio_data = VideoFileClip(video_buffer.name).audio
151
+
152
+ # raw_source = StringIO(video_file.getvalue().decode('utf-8'))
153
+ # raw_source = video_file.getvalue().decode('utf-8')
154
+ # raw_source = video_file.read()
155
+ # raw_source = BytesIO(video_file.getvalue())
156
+
157
+ #-- Get video
158
+ # out, err = (
159
+ # ffmpeg
160
+ # .input(video_file, ss=start_from)
161
+ # .output('temp.mp4', vcodec='copy')
162
+ # .overwrite_output()
163
+ # .run()
164
+ # )
165
+ # st.video('temp.mp4')
166
+
167
+ # video = VideoFileClip(video_file)
168
+ # audio = video.audio
169
+ # audio.write_audiofile('output_audio.mp3')
170
+
171
+ tfile = tempfile.NamedTemporaryFile(delete=False)
172
+ tfile.write(video_file.read())
173
+
174
+ #-- Get audio
175
+ # SAMPLE_RATE = 16000
176
+ audio_data, err = (
177
+ ffmpeg
178
+ .input(tfile.name, ss=start_from)
179
+ .output('pipe:', format='wav')#, acodec='pcm_s16le')
180
+ # .output('pipe:', format='s16le', ac=1, acodec='pcm_s16le', ar=SAMPLE_RATE)
181
+ # .global_args('-nostdin', '-threads', '0')
182
+ .run(capture_stdout=True)
183
+ )
184
+ if err:
185
+ raise RuntimeError(f'Failed to load audio: {err.decode()}')
186
+
187
+ return audio_data
188
+
189
+
190
+ @st.cache_resource
191
+ def extract_youtube(raw_url):
192
+ #-- Get video
193
+ # out, err = (
194
+ # ffmpeg
195
+ # .input(raw_url, ss=start_from)
196
+ # .output('temp.mp4', vcodec='copy')
197
+ # .overwrite_output()
198
+ # .run()
199
+ # )
200
+ # st.video('temp.mp4')
201
+
202
+ #-- Get audio
203
+ # SAMPLE_RATE = 16000
204
+ audio_data, err = (
205
+ ffmpeg
206
+ .input(raw_url, ss=start_from)
207
+ .output('pipe:', format='wav')#, acodec='pcm_s16le')
208
+ # .output('pipe:', format='s16le', ac=1, acodec='pcm_s16le', ar=SAMPLE_RATE)
209
+ .global_args('-nostdin', '-threads', '0')
210
+ .run(capture_stdout=True)
211
+ )
212
+ if err:
213
+ raise RuntimeError(f'Failed to load audio: {err.decode()}')
214
+
215
+ return audio_data
216
+
217
+
218
+
219
+
220
+ # --- Processing Button ---
221
+ if st.button('Process video input',
222
+ type='primary',
223
+ disabled=not (st.session_state.video_input_path or st.session_state.video_url)
224
+ ):
225
+ # Clear previous paths if reprocessing
226
+ st.session_state['video_path'] = None
227
+ st.session_state['audio_path'] = None
228
+
229
+ col_info, col_complete, col_next = st.columns(3)
230
+
231
+ with st.spinner('Processing video input..'):
232
+ if st.session_state['input_method'] == 'Upload' and uploaded_file:
233
+ st.session_state.uploaded_file = uploaded_file
234
+ video = uploaded_file
235
+ # audio_data = extract_videofile(uploaded_file)
236
+
237
+ saved_path = save_uploaded_file(uploaded_file)
238
+ if saved_path:
239
+ st.session_state['video_path'] = saved_path
240
+ col_info.success(f'Video saved temporarily to: {os.path.basename(saved_path)}')
241
+ else:
242
+ col_info.error('Failed to save uploaded file')
243
+
244
+ elif st.session_state['input_method'] == 'YouTube' and video_url:
245
+ try:
246
+ with YoutubeDL({'format': 'best+bestaudio'}) as ydl:
247
+ info = ydl.extract_info(video_url, download=False)
248
+ except Exception as e:
249
+ st.error(e)
250
+ else:
251
+ st.write(f"<small><div style='float: center; text-align: center'>\
252
+ **Title:** [{info['title']}]({video_url})\
253
+ **Duration:** {info['duration']} sec.</div></small>",
254
+ unsafe_allow_html=True)
255
+
256
+ video = video_url
257
+ # audio_data = extract_youtube(info['url'])
258
+ st.session_state.video_input_title = info['title']
259
+
260
+ session_dir = get_session_dir()
261
+ os.makedirs(session_dir, exist_ok=True)
262
+ downloaded_path = download_youtube(video_url, session_dir)
263
+ if downloaded_path and os.path.exists(downloaded_path):
264
+ st.session_state['video_path'] = downloaded_path
265
+ col_info.success(f'YouTube video downloaded: {os.path.basename(downloaded_path)}')
266
+ else:
267
+ col_info.error('Failed to download YouTube video')
268
+
269
+ else:
270
+ st.warning('Please upload a file or provide a YouTube URL')
271
+ st.stop()
272
+
273
+
274
+ # --- Basic Preprocessing: Audio Extraction ---
275
+ if st.session_state['video_path']:
276
+ # st.write('Extracting audio..')
277
+ start = time.time()
278
+ # Ensure utils.extract_audio uses the correct path
279
+ audio_path = extract_audio(st.session_state['video_path'])
280
+ end = time.time()
281
+ if audio_path and os.path.exists(audio_path):
282
+ st.session_state['audio_path'] = audio_path
283
+ col_info.success(f'Audio extracted to: {os.path.basename(audio_path)} (took {end - start:.2f}s)')
284
+ else:
285
+ col_info.error('Failed to extract audio from the video')
286
+ st.warning('Proceeding without audio. STT step will be skipped')
287
+ st.session_state['audio_path'] = None # explicitly set to None
288
+
289
+ if st.session_state['video_path']:
290
+ col_complete.info('Preprocessing complete')
291
+ col_next.page_link('ui_transcribe.py', label='Next Step: 🎙️ **Transcribe**', icon='➡️')
292
+
293
+
294
+ # Display video
295
+ st.subheader('Video Player')
296
+ _, col_video, _ = st.columns([1, 3, 1])
297
+ col_video.video(video)
298
+
299
+ # audio_data = audio_path
300
+ # audio_wav, audio_np = proc_raw_audio(audio_data)
301
+
302
+ # st.session_state.audio_wav = audio_wav
303
+ # st.session_state.audio_np = audio_np
304
+
305
+ # # st.session_state.video = video.read()
306
+
307
+ # ui_processed_sound(audio_wav, audio_np)
308
+
309
+
310
+ # # Display current status
311
+ # st.subheader("Current Status:")
312
+ # if st.session_state.get('video_path'):
313
+ # st.success(f"✅ Video Loaded: {os.path.basename(st.session_state['video_path'])}")
314
+ # else:
315
+ # st.warning("⏳ Video not yet loaded or processed.")
316
+
317
+ # if st.session_state.get('audio_path'):
318
+ # st.success(f"✅ Audio Extracted: {os.path.basename(st.session_state['audio_path'])}")
319
+ # elif st.session_state.get('video_path'): # only show warning if video was loaded but audio failed
320
+ # st.warning("⚠️ Audio extraction failed or video has no audio track.")
ui_video.py ADDED
@@ -0,0 +1,216 @@
1
+ import streamlit as st
2
+ import os
3
+ import pytesseract
4
+ from PIL import Image
5
+ import time
6
+ from utils import extract_frames_interval, extract_frames_pyscenedetect
7
+
8
+
9
+ st.title('🖼️ Step 3: Video Processing (Frame Extraction & OCR)')
10
+
11
+
12
+ # Check if video path exists
13
+ if ('video_path' not in st.session_state or
14
+ not st.session_state['video_path'] or
15
+ not os.path.exists(st.session_state['video_path'])
16
+ ):
17
+ st.warning('Video file not found. Please go back to the **📤 Upload** page and process a video first.')
18
+ st.stop()
19
+
20
+ video_path = st.session_state['video_path']
21
+ st.write(f'Video file to process: `{os.path.basename(video_path)}`')
22
+
23
+ #
24
+ # ==================================================================
25
+ #
26
+
27
+ col_method, col_config = st.columns(2)
28
+
29
+ # --- Method ---
30
+ # with col_model.expander('**MODEL**', expanded=True):
31
+ with col_method.container(border=True):
32
+ # extraction_method = st.selectbox(
33
+ # 'Extraction method:',
34
+ # ('interval', 'video2slides', 'pyscenedetect'),
35
+ # index=0
36
+ # )
37
+ extraction_method = st.radio(
38
+ 'Extraction method:',
39
+ ('interval', 'video2slides', 'pyscenedetect'),
40
+ index=0,
41
+ horizontal=True,
42
+ )
43
+
44
+ # col_config_frame_interval, col_config_ocr_lang = st.columns(2)
45
+ # frame_interval = col_config_frame_interval.slider('Extract frames every `X` seconds:', min_value=1, max_value=60, value=5, step=1)
46
+ # ocr_lang = col_config_ocr_lang.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
47
+ ocr_lang = st.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
48
+
49
+ # --- Configuration ---
50
+ with col_config.expander(f'**`{extraction_method}` METHOD CONFIG**', expanded=True):
51
+ match extraction_method:
52
+ case 'interval':
53
+ extraction_interval = st.number_input(
54
+ 'Frames extraction interval:',
55
+ min_value=0, max_value=25, step=1, format='%i', value=5,
56
+ help='Extract frames every `x` seconds'
57
+ )
58
+ case 'video2slides':
59
+ print('video2slides')
60
+ case 'pyscenedetect':
61
+ extraction_threshold = st.number_input(
62
+ 'Frames extraction threshold:',
63
+ min_value=0.1, max_value=10.0, step=0.1, format='%f', value=2.0,
64
+ )
65
+
66
+
67
+ # --- Semantic Segmentation Placeholder ---
68
+ # st.markdown("---")
69
+ # --- Tesseract Configuration (Optional but recommended) ---
70
+ # Uncomment and set the path if tesseract is not in your PATH
71
+ # pytesseract.pytesseract.tesseract_cmd = r'/path/to/your/tesseract' # Example: '/usr/bin/tesseract' or 'C:\Program Files\Tesseract-OCR\tesseract.exe'
72
+
73
+
74
+
75
+
76
+ # # --- Frame Extraction and OCR ---
77
+ # st.subheader('OCR')
78
+
79
+ if st.button('Extract Frames'):
80
+ # st.session_state['ocr_text'] = None # clear previous results
81
+ st.session_state['frames_paths'] = []
82
+ # all_ocr_results = []
83
+
84
+ col_info, col_complete, col_next = st.columns(3)
85
+
86
+ match extraction_method:
87
+ case 'interval':
88
+ with st.spinner(f'Extracting frames every {extraction_interval} seconds (using interval method)..'):
89
+ start_time = time.time()
90
+ frames_dir, frame_paths = extract_frames_interval(video_path, 'frames_interval', interval_sec=extraction_interval)
91
+ extract_time = time.time() - start_time
92
+ if frames_dir and frame_paths:
93
+ st.session_state['frames_dir'] = frames_dir
94
+ st.session_state['frames_paths'] = frame_paths # store paths
95
+ col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
96
+ else:
97
+ col_info.error('Failed to extract frames')
98
+ st.stop()
99
+ case 'video2slides':
100
+ pass
101
+ case 'pyscenedetect':
102
+ with st.spinner(f'Extracting frames with `threshold={extraction_threshold}` (using pyscenedetect method)..'):
103
+ start_time = time.time()
104
+ frames_dir, frame_paths = extract_frames_pyscenedetect(video_path, 'frames_pyscenedetect', threshold=extraction_threshold)
105
+ extract_time = time.time() - start_time
106
+ if frames_dir and frame_paths:
107
+ st.session_state['frames_dir'] = frames_dir
108
+ st.session_state['frames_paths'] = frame_paths # store paths
109
+ col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
110
+ else:
111
+ col_info.error('Failed to extract frames')
112
+ st.stop()
113
+
114
+
115
+ if st.session_state['frames_paths']:
116
+ total_frames = len(st.session_state['frames_paths'])
117
+ col_info.write(f'Performing OCR on {total_frames} frames..')
118
+ ocr_progress = st.progress(0)
119
+ start_ocr_time = time.time()
120
+
121
+ extracted_texts = []
122
+ processed_count = 0
123
+
124
+ # Use columns to display some example frames and OCR
125
+ max_display_frames = 6
126
+ display_cols = st.columns(min(max_display_frames, total_frames) if total_frames > 0 else 1)
127
+ display_idx = 0
128
+
129
+ # Process frames in batches or one by one
130
+ for i, frame_path in enumerate(st.session_state['frames_paths']):
131
+ try:
132
+ img = Image.open(frame_path)
133
+ # --- Potential Preprocessing/Filtering ---
134
+ # Add logic here if needed:
135
+ # - Detect if frame likely contains text (e.g., check contrast, edges)
136
+ # - If segmentation was implemented, crop to slide regions here
137
+ # --- Perform OCR ---
138
+ text = pytesseract.image_to_string(img, lang=ocr_lang)
139
+ # --- Basic Text Cleaning/Filtering ---
140
+ cleaned_text = text.strip()
141
+ if cleaned_text and len(cleaned_text) > 10: # filter very short/noisy results
142
+ # Extract timestamp from filename (assuming format frame_XXXXXX.png)
143
+ try:
144
+ secs = int(os.path.basename(frame_path).split('_')[1].split('.')[0])
145
+ timestamp = time.strftime('%H:%M:%S', time.gmtime(secs))
146
+ extracted_texts.append({'timestamp': timestamp, 'text': cleaned_text})
147
+ except:
148
+ extracted_texts.append({'timestamp': 'N/A', 'text': cleaned_text}) # fallback if filename parse fails
149
+
150
+
151
+ # Display some examples
152
+ if display_idx < max_display_frames and display_idx < len(display_cols):
153
+ with display_cols[display_idx]:
154
+ st.image(img, caption=f'Frame (t={timestamp})', use_container_width=True)
155
+ st.text(f'OCR:\n{cleaned_text[:100]}..') # show snippet
156
+ display_idx += 1
157
+
158
+
159
+ processed_count += 1
160
+ ocr_progress.progress(processed_count / total_frames)
161
+
162
+ except Exception as ocr_err:
163
+ col_info.warning(f'Could not perform OCR on {os.path.basename(frame_path)}: {ocr_err}')
164
+ processed_count += 1 # still count as processed
165
+ ocr_progress.progress(processed_count / total_frames)
166
+
167
+ ocr_time = time.time() - start_ocr_time
168
+ col_complete.success(f'OCR processing finished in {ocr_time:.2f}s.')
169
+
170
+ # --- Aggregate and Deduplicate OCR Text ---
171
+ # Simple approach: Combine unique text blocks
172
+ final_ocr_text = ""
173
+ seen_texts = set()
174
+ last_text = ""
175
+ min_similarity_threshold = 0.8 # requires a library like `thefuzz` or similar for proper check
176
+ # basic check: avoid exact consecutive duplicates
177
+
178
+ for item in extracted_texts:
179
+ current_text_block = item['text'].strip()
180
+
181
+ # Basic check: Only add if significantly different from the last block
182
+ # A more robust check would involve sequence matching or fuzzy matching
183
+ is_duplicate = False
184
+ if last_text:
185
+ # Simple check: exact match or near-exact length/content start?
186
+ if (current_text_block == last_text or
187
+ (abs(len(current_text_block) - len(last_text)) < 10 and
188
+ current_text_block.startswith(last_text[:20]))
189
+ ):
190
+ is_duplicate = True # likely a duplicate from consecutive frames
191
+
192
+ if current_text_block and not is_duplicate: # only add non-empty, non-duplicate text
193
+ final_ocr_text += f"\n\n--- Text from frame around {item['timestamp']} ---\n"
194
+ final_ocr_text += current_text_block
195
+ last_text = current_text_block # update last text added
196
+
197
+ st.session_state['ocr_text'] = final_ocr_text.strip()
198
+
199
+ if st.session_state['ocr_text']:
200
+ col_complete.info('OCR processing complete.')
201
+ col_next.page_link('ui_summarize.py', label='Next Step: **📝 Summarize**', icon='➡️')
202
+ else:
203
+ col_complete.warning('No significant text found via OCR')
204
+
205
+
206
+ # --- Display OCR Results ---
207
+ st.subheader('Aggregated OCR Text')
208
+ if 'ocr_text' in st.session_state and st.session_state['ocr_text']:
209
+ st.text_area("Extracted Text from Frames", st.session_state['ocr_text'], height=400)
210
+ else:
211
+ st.info('OCR has not been run or no text was detected')
212
+
213
+
214
+ # st.divider()
215
+
216
+ # st.subheader('Semantic Segmentation')
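The deduplication above only catches exact or near-exact consecutive repeats; the `min_similarity_threshold` comment points at fuzzy matching. A stdlib-only sketch of such a check (the threshold is illustrative; `thefuzz` would work similarly):

```python
from difflib import SequenceMatcher

def is_near_duplicate(current: str, previous: str, threshold: float = 0.8) -> bool:
    """Rough similarity check for consecutive OCR blocks from adjacent frames."""
    if not current or not previous:
        return False
    return SequenceMatcher(None, current, previous).ratio() >= threshold

# usage inside the aggregation loop (sketch):
#   if current_text_block and not is_near_duplicate(current_text_block, last_text):
#       final_ocr_text += ...
```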
utils.py ADDED
@@ -0,0 +1,436 @@
1
+ import os
2
+ import tempfile
3
+ # import ffmpeg
4
+ from moviepy.video.io.VideoFileClip import VideoFileClip
5
+ import cv2
6
+ import uuid
7
+ import tomllib
8
+ from pathlib import Path
9
+
10
+ import streamlit as st
11
+
12
+ import numpy as np
13
+ from io import BytesIO
14
+
15
+ from pydub import AudioSegment
16
+ from pydub.silence import detect_leading_silence
17
+ import librosa
18
+
19
+ import librosa.display as lbd
20
+ import matplotlib.pyplot as plt
21
+
22
+
23
+ TEMP_DIR = tempfile.mkdtemp()
24
+
25
+
26
+ CONFIG_FILE = 'config.toml'
27
+
28
+
29
+ def load_config():
30
+ """Loads configuration from config.toml"""
31
+ try:
32
+ with open(CONFIG_FILE, 'rb') as f:
33
+ return tomllib.load(f)
34
+ except FileNotFoundError:
35
+ print(f"Error: {CONFIG_FILE} not found. Using default settings.")
36
+ # Provide default fallback config if needed
37
+ return {
38
+ "paths": {"output_dir": "output", "temp_dir": "temp_processing"},
39
+ "models": {"whisper_model": "base.en", "ocr_languages": ["en"], "summarization_model": "google/pegasus-xsum"},
40
+ "settings": {"frame_extraction_interval_seconds": 10, "max_summary_length": 500, "min_summary_length": 100}
41
+ }
42
+ except Exception as e:
43
+ print(f"Error loading config: {e}")
44
+ raise # Re-raise after printing
45
+
46
+
47
+ CONFIG = load_config()
48
+
49
+
50
+ def ensure_dir(directory_path):
51
+ """Creates a directory if it doesn't exist."""
52
+ Path(directory_path).mkdir(parents=True, exist_ok=True)
53
+
54
+
55
+
56
+
57
+ def save_uploaded_file(uploaded_file):
58
+ """Saves an uploaded file to a temporary directory."""
59
+ if uploaded_file is not None:
60
+ # Generate a unique sub-directory for this upload
61
+ session_id = get_session_id() # simple way to group files per session/upload
62
+ upload_dir = os.path.join(TEMP_DIR, session_id)
63
+ os.makedirs(upload_dir, exist_ok=True)
64
+
65
+ file_path = os.path.join(upload_dir, uploaded_file.name)
66
+ with open(file_path, 'wb') as f:
67
+ f.write(uploaded_file.getbuffer())
68
+ print(f'File saved to: {file_path}') # debugging
69
+ return file_path
70
+ return None
71
+
72
+
73
+ def get_session_id():
74
+ """Generates or retrieves a unique session ID."""
75
+ if 'session_id' not in st.session_state:
76
+ st.session_state['session_id'] = str(uuid.uuid4())[:8]
77
+ return st.session_state['session_id']
78
+
79
+
80
+ def get_session_dir():
81
+ """Gets the temporary directory path for the current session."""
82
+ session_id = get_session_id()
83
+ return os.path.join(TEMP_DIR, session_id)
84
+
85
+
86
+ def get_temp_dir():
87
+ """Creates and returns the path to a temporary directory for processing."""
88
+ temp_dir = Path(CONFIG['paths']['temp_dir'])
89
+ ensure_dir(temp_dir)
90
+ # Consider using unique subdirs per run if needed
91
+ # processing_subdir = tempfile.mkdtemp(dir=temp_dir)
92
+ # return processing_subdir
93
+ return str(temp_dir) # Return as string for wider compatibility
94
+
95
+
96
+ def extract_audio(video_path, audio_format="wav"):
97
+ """Extracts audio from video using moviepy."""
98
+ try:
99
+ session_dir = os.path.dirname(video_path) # Assumes video is in session dir
100
+ base_name = os.path.splitext(os.path.basename(video_path))[0]
101
+ audio_filename = f"{base_name}_audio.{audio_format}"
102
+ audio_path = os.path.join(session_dir, audio_filename)
103
+
104
+ if os.path.exists(audio_path):
105
+ print(f"Audio file already exists: {audio_path}")
106
+ return audio_path
107
+
108
+ print(f"Extracting audio from {video_path} to {audio_path}...")
109
+ video_clip = VideoFileClip(video_path)
110
+ audio_clip = video_clip.audio
111
+ if audio_clip is None:
112
+ print("No audio track found in the video.")
113
+ video_clip.close()
114
+ return None
115
+ audio_clip.write_audiofile(audio_path, codec='pcm_s16le' if audio_format == 'wav' else 'mp3') # WAV is often better for STT
116
+ audio_clip.close()
117
+ video_clip.close()
118
+ print("Audio extraction complete.")
119
+ return audio_path
120
+ except Exception as e:
121
+ print(f"Error extracting audio: {e}")
122
+ # Clean up potentially corrupted file
123
+ if 'audio_clip' in locals() and audio_clip:
124
+ audio_clip.close()
125
+ if 'video_clip' in locals() and video_clip:
126
+ video_clip.close()
127
+ # Attempt to remove partial file if creation failed mid-way
128
+ if 'audio_path' in locals() and os.path.exists(audio_path):
129
+ try:
130
+ os.remove(audio_path)
131
+ except OSError as rm_e:
132
+ print(f"Could not remove partial audio file {audio_path}: {rm_e}")
133
+ return None
134
+
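+ # Usage sketch (file name is illustrative): extract a WAV track from the uploaded video so
+ # it can be fed to the Whisper transcriber.
+ #
+ # audio_path = extract_audio(video_path, audio_format='wav')
+ # if audio_path is not None:
+ #     print(f'Audio ready for transcription: {audio_path}')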
135
+
136
+ from scenedetect import open_video, SceneManager
137
+ from scenedetect.detectors import ContentDetector
138
+
139
+
140
+ def extract_frames_pyscenedetect(video_path, output_dir, threshold=2.0):
+ '''Detects scene changes with PySceneDetect and saves the first frame of each scene.'''
141
+ # session_dir = os.path.dirname(video_path)
142
+ # frames_dir = os.path.join(session_dir, 'frames_pyscenedetect')
143
+ # os.makedirs(frames_dir, exist_ok=True)
144
+ os.makedirs(output_dir, exist_ok=True) # ensure the output dir exists
145
+
146
+ # Init video and scene managers
147
+ # video_manager = VideoManager([video_path])
148
+ video = open_video(video_path)
149
+ scene_manager = SceneManager()
150
+
151
+ scene_manager.add_detector(ContentDetector(threshold=threshold))
152
+
153
+ # Start analysis
154
+ # video_manager.set_downscale_factor()
155
+ # video_manager.start()
156
+ # scene_manager.detect_scenes(frame_source=video_manager)
157
+ scene_manager.detect_scenes(video)
158
+ print(scene_manager.get_scene_list())
159
+
160
+ # Get the scene list
161
+ scene_list = scene_manager.get_scene_list()
162
+ print(f'Detected {len(scene_list)} scene changes.')
163
+
164
+ # Save the scenes switch frames
165
+ cap = cv2.VideoCapture(video_path)
166
+ if not cap.isOpened():
167
+ print(f'Error: Could not open video file {video_path}')
168
+ return None
169
+
170
+ extracted_frame_paths = []
171
+
172
+ for i, (start_time, _) in enumerate(scene_list):
173
+ frame_num = start_time.get_frames()
174
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
175
+ success, frame = cap.read()
176
+ if success:
177
+ timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
178
+ # frame_filename = f'scene_{i + 1:03d}.jpg'
179
+ frame_filename = f'frame_{int(timestamp_ms / 1000):06d}.png' # naming by seconds
180
+ frame_path = os.path.join(output_dir, frame_filename)
181
+ cv2.imwrite(frame_path, frame)
182
+ print(f'[*] Saved frame {frame_num} to {frame_path}')
183
+ extracted_frame_paths.append(frame_path)
184
+ else:
185
+ print(f'[!] Error reading frame {frame_num}')
186
+
187
+ cap.release()
188
+ print(f'Extracted {len(extracted_frame_paths)} frames to {output_dir}.')
189
+ return output_dir, extracted_frame_paths
191
+
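+ # Usage sketch (paths are illustrative): scene-change based extraction; lower ContentDetector
+ # thresholds make detection more sensitive, which helps catch subtle slide transitions.
+ #
+ # frames_dir, frame_paths = extract_frames_pyscenedetect(
+ #     video_path, os.path.join(get_session_dir(), 'frames_scenes'), threshold=2.0)
+ # print(f'{len(frame_paths)} scene-change frames saved to {frames_dir}')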
192
+
193
+ def extract_frames_interval(video_path, output_dir, interval_sec=5):
194
+ '''Extracts frames from video at specified intervals using OpenCV.'''
195
+ try:
196
+ # session_dir = os.path.dirname(video_path)
197
+ # frames_dir = os.path.join(session_dir, 'frames_interval')
198
+ # os.makedirs(frames_dir, exist_ok=True)
199
+ os.makedirs(output_dir, exist_ok=True) # ensure the output dir exists
200
+
201
+ print(f'Extracting frames from {video_path} every {interval_sec}s..')
202
+ cap = cv2.VideoCapture(video_path)
203
+ if not cap.isOpened():
204
+ print(f'Error: Could not open video file {video_path}')
205
+ return None
206
+
207
+ fps = cap.get(cv2.CAP_PROP_FPS)
208
+ if fps == 0:
209
+ print('Warning: Could not get FPS, defaulting to 30.')
210
+ fps = 30 # provide a default if FPS is not available
211
+
212
+ frame_interval = int(fps * interval_sec)
213
+ frame_count = 0
214
+ extracted_frame_paths = []
215
+
216
+ def extract_frame():
217
+ timestamp_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
218
+ frame_filename = f'frame_{int(timestamp_ms / 1000):06d}.png' # naming by seconds
219
+ frame_path = os.path.join(output_dir, frame_filename)
220
+ cv2.imwrite(frame_path, frame)
221
+ extracted_frame_paths.append(frame_path)
222
+
223
+ success = True
224
+ while success:
225
+ if frame_count % frame_interval == 0:
226
+ # Decode only the frames we actually keep
227
+ success, frame = cap.read()
228
+ if success:
229
+ extract_frame()
230
+ else:
231
+ # Skip intermediate frames without decoding them; grab() just advances the stream
232
+ success = cap.grab()
233
+ frame_count += 1
246
+
247
+ cap.release()
248
+ print(f'Extracted {len(extracted_frame_paths)} frames to {output_dir}.')
249
+ return output_dir, extracted_frame_paths
250
+ except Exception as e:
251
+ print(f'Error extracting frames: {e}')
252
+ if 'cap' in locals() and cap.isOpened():
253
+ cap.release()
254
+ return None, []
255
+
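+ # Usage sketch (illustrative): fixed-interval sampling driven by the config value above;
+ # with a 30 fps source and a 10 s interval, roughly every 300th frame is kept.
+ #
+ # frames_dir, frame_paths = extract_frames_interval(
+ #     video_path, os.path.join(get_session_dir(), 'frames_interval'),
+ #     interval_sec=CONFIG['settings']['frame_extraction_interval_seconds'])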
256
+
257
+ # --- Add other potential helpers: yt-dlp download, file cleanup etc. ---
258
+ def download_youtube(url, output_dir):
259
+ """Downloads YouTube video using yt-dlp."""
260
+ import yt_dlp
261
+ ydl_opts = {
262
+ 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
263
+ 'outtmpl': os.path.join(output_dir, '%(title)s.%(ext)s'),
264
+ 'noplaylist': True, # download only single video if URL is part of playlist
265
+ 'progress_hooks': [lambda d: print(d['status'])] # basic progress
266
+ }
267
+ try:
268
+ print(f'Attempting to download YouTube video: {url}')
269
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
270
+ info = ydl.extract_info(url, download=True)
271
+ # Try to get the downloaded filename
272
+ filename = ydl.prepare_filename(info)
273
+ print(f"YouTube video downloaded to: {filename}")
274
+ return filename
275
+ except Exception as e:
276
+ print(f"Error downloading YouTube video: {e}")
277
+ return None
278
+
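+ # Usage sketch (URL is a placeholder): download into the session directory, then reuse the
+ # audio/frame helpers above on the resulting file.
+ #
+ # video_file = download_youtube('https://www.youtube.com/watch?v=...', get_session_dir())
+ # if video_file:
+ #     audio_file = extract_audio(video_file)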
279
+
280
+ def cleanup_session_files(session_id):
281
+ """Removes the temporary directory for a given session."""
282
+ session_dir = os.path.join(TEMP_DIR, session_id)
283
+ if os.path.exists(session_dir):
284
+ import shutil
285
+ try:
286
+ shutil.rmtree(session_dir)
287
+ print(f"Cleaned up temporary files for session: {session_id}")
288
+ except Exception as e:
289
+ print(f"Error cleaning up session files {session_dir}: {e}")
290
+
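+ # Usage sketch: typically called when a new upload starts or the app is reset.
+ #
+ # cleanup_session_files(get_session_id())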
291
+
292
+
293
+
294
+ ###
295
+ ###=== Audio Loading and Processing
296
+ ###
297
+
298
+
299
+ SAMPLE_RATE = 22050
300
+ DURATION = 5
301
+
302
+ n_mfcc = 13 # number of MFCCs to extract from each sample
303
+ n_mels = 128
304
+
305
+ n_fft = 2048
306
+ hop_length = 512
307
+
308
+ delta_width = 9 # MFCC Delta parameter
309
+
310
+
311
+ def trim_silence(sound, s_thresh=-28.0):
312
+ '''Trims silent chunks from beginning and end of the sound'''
313
+ duration = len(sound)
314
+
315
+ start_trim = detect_leading_silence(sound, s_thresh)
316
+ end_trim = detect_leading_silence(sound.reverse(), s_thresh)
317
+
318
+ start = start_trim if start_trim != duration else None
319
+ end = duration - end_trim if end_trim != duration else None
320
+
321
+ return sound[start:end]
322
+
323
+
324
+ def normalize_volume(sound, target_dBFS=-20.0):
325
+ '''Normalizes sound and shifts to specified loudness'''
326
+ sound = sound.normalize()
327
+ difference = target_dBFS - sound.dBFS
328
+ return sound.apply_gain(difference)
329
+
330
+
331
+ def proc_raw_audio(audio_data, from_start=0, duration=None, before_end=0):
332
+ '''Processes raw audio data and returns WAV bytes and a NumPy signal array'''
333
+ # Instantiate a pydub AudioSegment object from the raw audio
334
+ audioObj = AudioSegment.from_file(BytesIO(audio_data))
335
+
336
+ # Convert to mono mode with the desired sample rate
337
+ audioObj = audioObj.set_frame_rate(SAMPLE_RATE).set_channels(1)
338
+ # Normalize audio volume
339
+ audioObj = normalize_volume(audioObj)
340
+ # Trim by removing silence from beginning and end of the sound
341
+ audioObj = trim_silence(audioObj)
342
+
343
+ # Cut to the desired duration
344
+ start = from_start * 1000
345
+ if duration:
346
+ end = start + duration * 1000
347
+ else:
348
+ end = len(audioObj) - before_end * 1000
349
+ audioObj = audioObj[start:end]
350
+
351
+ # Convert AudioSegment to wav format instance
352
+ buf = BytesIO()
353
+ audioObj.export(buf, format='wav')
354
+ audio_wav = buf.getvalue()
355
+
356
+ # Convert the AudioSegment to signal in form of numpy.array
357
+ arr = audioObj.get_array_of_samples()
358
+ audio_np = np.array(arr, dtype='float')
359
+
360
+ # Normalize if specified
361
+ # if normalized:
362
+ # audio_np = np.array(arr) / np.iinfo(arr.typecode).max
363
+ # y /= np.linalg.norm(y)
364
+ # return y, sample_rate
365
+
366
+ return audio_wav, audio_np
367
+
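+ # Usage sketch (file name and the 30 s window are illustrative): normalize/trim raw bytes
+ # and get back both a playable WAV and a numpy signal.
+ #
+ # with open(audio_path, 'rb') as f:
+ #     raw_bytes = f.read()
+ # wav_bytes, signal = proc_raw_audio(raw_bytes, from_start=60, duration=30)
+ # st.audio(wav_bytes, format='audio/wav')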
368
+
369
+ ###==============================================
370
+
371
+
372
+ def obtain_features(y, sr=22050, duration=5, delta_width=9):
373
+ '''Extracts sound features from given signal and returns them as a numpy array'''
374
+ # --- MFCC (returns M: np.ndarray [shape=(n_mfcc, t)])
375
+ mfcc = librosa.feature.mfcc(y=y, sr=sr,  # keyword args (positional y/sr are rejected by newer librosa)
376
+ n_mfcc=n_mfcc, n_mels=n_mels,
377
+ n_fft=n_fft, hop_length=hop_length)
378
+
379
+ return mfcc
380
+
381
+
382
+ def create_features_array(mfcc):#, mfcc_delta1, mfcc_delta2, spectr_c, spectr_r):
383
+ '''Creates a combined numpy array of means and variances from the given features'''
384
+ make_meanvar = lambda mean, var: [item for mv in zip(mean, var) for item in mv]
385
+
386
+ mean_var_ops = [
387
+ (mfcc.mean(axis=1), mfcc.var(axis=1))
388
+ ]
389
+
390
+ mfcc_meanvars = sum([make_meanvar(mean, var)
391
+ for mean, var in mean_var_ops], [])
392
+
393
+ # features_array = mfcc_meanvars + spectr_meanvars
394
+ features_array = [mfcc_meanvars]
395
+
396
+ return features_array
397
+
398
+ # def get_features(y, sr=22050, duration=5, delta_width=9):
399
+ # '''Returns numpy array of sound features obtained from signal'''
400
+ # return create_features_array(*obtain_features(y, sr, duration, delta_width))
401
+
402
+
403
+ def get_features(y, duration=5, sr=SAMPLE_RATE):
404
+ '''Returns numpy array of sound features obtained from signal'''
405
+
406
+ fig, axes = plt.subplots(1, 2, figsize=(24, 2))
407
+
408
+ # WAVE PLOT
409
+ axes[0].set_title(f'Wave Plot for audio sample at {sr} Hz')
410
+ axes[0].set_facecolor('#B4E8CF')
411
+ lbd.waveshow(y, sr=sr, color='#4300FF', ax=axes[0])
412
+
413
+ # MELSPEC
414
+ melspec = librosa.feature.melspectrogram(y=y, sr=sr)
415
+ melspec = librosa.power_to_db(np.abs(melspec), ref=np.max)
416
+ axes[1].set_title(f'Mel Spectrogram | shape: {melspec.shape}')
417
+ lbd.specshow(melspec, cmap='viridis', y_axis='mel', x_axis='time', ax=axes[1])
418
+
419
+ st.pyplot(fig)
420
+
421
+ pad_signal = lambda s, v: np.pad(
422
+ s,
423
+ [(0, 0), (0, max(0, 216 - s.shape[1]))],
424
+ constant_values=v
425
+ )
426
+
427
+ # Prepare melspec for use
428
+ melspec = pad_signal(melspec, melspec.min())
429
+ melspec = melspec.reshape(1, *melspec.shape)
430
+
431
+ # MFCC
432
+ # mfcc = create_features_array(obtain_features(y, sr, duration, delta_width))
433
+ # mfcc = np.array(mfcc).reshape(1, -1)
434
+
435
+ return melspec
436
+ # return mfcc
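+ # Usage sketch (names are illustrative): the processed signal becomes a padded mel-spectrogram
+ # batch of shape (1, n_mels, 216), suitable as input to a downstream audio classifier.
+ #
+ # _, signal = proc_raw_audio(raw_bytes, duration=DURATION)
+ # melspec_batch = get_features(signal)
+ # print(melspec_batch.shape)  # e.g. (1, 128, 216)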